awacke1 committed
Commit a0010c7 · verified · 1 Parent(s): 02cb7fa

Update app.py

Files changed (1)
  1. app.py +96 -46
app.py CHANGED
@@ -10,11 +10,26 @@ import soundfile as sf
 from dotenv import load_dotenv
 from transformers import AutoProcessor, AutoModel
 import torch
+import tempfile

 # Load environment variables
 load_dotenv()
 HF_TOKEN = os.getenv("API_KEY")

+def cleanup_temp_files():
+    temp_files = [
+        os.path.join(tempfile.gettempdir(), 'speech_output.flac'),
+        os.path.join(tempfile.gettempdir(), 'audio.mp3'),
+        os.path.join(tempfile.gettempdir(), 'my_vid_tmp.mp4'),
+        os.path.join(tempfile.gettempdir(), 'mergedvideo.mp4')
+    ]
+    for file in temp_files:
+        if os.path.exists(file):
+            try:
+                os.remove(file)
+            except:
+                pass
+
 def resize(img_list):
     resize_img_list = []
     for item in img_list:
@@ -24,68 +39,103 @@ def resize(img_list):
     return resize_img_list

 def text2speech(text):
-    # Using Microsoft's SpeechT5 model instead of FastSpeech2
-    processor = AutoProcessor.from_pretrained("microsoft/speecht5_tts")
-    model = AutoModel.from_pretrained("microsoft/speecht5_tts")
-
-    # Preprocessing text input
-    inputs = processor(text=text, return_tensors="pt")
-
-    # Generate speech with default speaker embedding
-    speaker_embeddings = torch.zeros((1, model.config.speaker_embedding_size))
-    speech = model.generate_speech(inputs["input_ids"], speaker_embeddings)
-
-    # Save as flac file
-    sf.write("speech_output.flac", speech.numpy(), samplerate=16000)
-    return "speech_output.flac"
+    try:
+        processor = AutoProcessor.from_pretrained("microsoft/speecht5_tts")
+        model = AutoModel.from_pretrained("microsoft/speecht5_tts")
+
+        inputs = processor(text=text, return_tensors="pt")
+        speaker_embeddings = torch.zeros((1, model.config.speaker_embedding_size))
+        speech = model.generate_speech(inputs["input_ids"], speaker_embeddings)
+
+        output_path = os.path.join(tempfile.gettempdir(), "speech_output.flac")
+        sf.write(output_path, speech.numpy(), samplerate=16000)
+        return output_path
+    except Exception as e:
+        print(f"Error in text2speech: {str(e)}")
+        raise

 def merge_audio_video(entities_num, resize_img_list, text_input):
-    speech = text2speech(text_input)
-    wav_audio = AudioSegment.from_file(speech, "flac")
-    wav_audio.export("audio.mp3", format="mp3")
-    audio_length = int(MP3("audio.mp3").info.length)
-    fps = entities_num / audio_length
-    fps = float(format(fps, '.5f'))
-
-    clip = ImageSequenceClip(resize_img_list, fps=fps)
-    clip.write_videofile('my_vid_tmp.mp4')
-
-    videoclip = VideoFileClip('my_vid_tmp.mp4')
-    audioclip = AudioFileClip('audio.mp3')
-    mergedclip = videoclip.set_audio(audioclip)
-
-    return mergedclip
+    try:
+        speech = text2speech(text_input)
+        wav_audio = AudioSegment.from_file(speech, "flac")
+
+        audio_path = os.path.join(tempfile.gettempdir(), "audio.mp3")
+        wav_audio.export(audio_path, format="mp3")
+
+        audio_length = int(MP3(audio_path).info.length)
+        fps = max(entities_num / audio_length, 1)  # Ensure fps is at least 1
+        fps = float(format(fps, '.5f'))
+
+        temp_video = os.path.join(tempfile.gettempdir(), "my_vid_tmp.mp4")
+        clip = ImageSequenceClip(resize_img_list, fps=fps)
+        clip.write_videofile(temp_video, codec='libx264', fps=fps)
+
+        videoclip = VideoFileClip(temp_video)
+        audioclip = AudioFileClip(audio_path)
+        mergedclip = videoclip.set_audio(audioclip)
+
+        output_path = os.path.join(tempfile.gettempdir(), "mergedvideo.mp4")
+        mergedclip.write_videofile(output_path)
+
+        # Clean up clips
+        videoclip.close()
+        audioclip.close()
+        mergedclip.close()
+
+        return output_path
+    except Exception as e:
+        print(f"Error in merge_audio_video: {str(e)}")
+        raise
+    finally:
+        cleanup_temp_files()

-with gr.Blocks() as app:
-    # Load models in Blocks context
-    ner = gr.Interface.load("huggingface/flair/ner-english-ontonotes-large")
-    latentdiffusion = gr.Interface.load("spaces/multimodalart/latentdiffusion")
+# Load models outside the Blocks context
+ner = gr.load("huggingface/flair/ner-english-ontonotes-large")
+latentdiffusion = gr.load("spaces/multimodalart/latentdiffusion")

-    def engine(text_input):
+def engine(text_input):
+    try:
         entities = ner(text_input)
         entities = [tupl for tupl in entities if None not in tupl]
         entities_num = len(entities)
-        img_list = []

+        if entities_num == 0:
+            raise ValueError("No entities found in the input text")
+
+        img_list = []
         for ent in entities:
             img = latentdiffusion(ent[0], '50', '256', '256', '1', 10)[0]
             img_list.append(img)

         resize_img_list = resize(img_list)
-        mergedclip = merge_audio_video(entities_num, resize_img_list, text_input)
-        mergedclip.write_videofile('mergedvideo.mp4')
+        output_path = merge_audio_video(entities_num, resize_img_list, text_input)

-        return 'mergedvideo.mp4'
+        return output_path
+    except Exception as e:
+        print(f"Error in engine: {str(e)}")
+        raise gr.Error(f"An error occurred: {str(e)}")
+    finally:
+        cleanup_temp_files()

-    interface = gr.Interface(
-        fn=engine,
-        inputs=gr.Textbox(lines=5, label="Input Text"),
-        outputs=gr.Video(label='Final Merged Video'),
-        description="<div>🎭🎞️🍿 AI Movie Maker - Comedy 🎬 🧠 🎨</div>",
+with gr.Blocks() as app:
+    gr.Markdown("# AI Pipeline Multi Model 🎭🎞️🍿 Movie Maker 🎬 🧠 🎨")
+    gr.Markdown("<div>🎭🎞️🍿 AI Movie Maker - Comedy 🎬 🧠 🎨</div>")
+    text_input = gr.Textbox(lines=5, label="Input Text")
+    output_video = gr.Video(label='Final Merged Video')
+    examples = gr.Examples(
         examples=[
             ["Two space marines take up arms to save the planet from an alien invasion. These two dashing strong men play a comedic role in the science fiction movie of the future where even Barnaby bunny is willing to join their wacky gang of space marines to save the planet with good looks and comedy."]
         ],
-        title="AI Pipeline Multi Model 🎭🎞️🍿 Movie Maker 🎬 🧠 🎨",
-        article="<br><div></div>"
+        inputs=text_input
     )
-    interface.launch(debug=True)
+    submit_button = gr.Button("Generate Video")
+    submit_button.click(fn=engine, inputs=text_input, outputs=output_video)
+
+    gr.Markdown("<br><div></div>")
+
+app.launch(
+    debug=True,
+    share=True,  # Enable sharing
+    server_name="0.0.0.0",  # Listen on all interfaces
+    server_port=7860  # Specify port
+)
 
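A note on the TTS call pattern this commit keeps: loading microsoft/speecht5_tts through the generic AutoProcessor/AutoModel classes may resolve to the base SpeechT5Model, which does not expose generate_speech; the Transformers documentation uses the task-specific classes plus a vocoder and a real speaker embedding. A minimal sketch of that documented pattern follows (the x-vector dataset and index come from the docs example, not from this commit):

import torch
import soundfile as sf
from datasets import load_dataset
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan

# Task-specific classes from the SpeechT5 docs, rather than AutoProcessor/AutoModel.
processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")

inputs = processor(text="Two space marines save the planet.", return_tensors="pt")

# Speaker x-vector from the CMU Arctic embeddings used in the HF docs example
# (index 7306), in place of the commit's all-zeros embedding.
embeddings = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
speaker_embeddings = torch.tensor(embeddings[7306]["xvector"]).unsqueeze(0)

# Without the vocoder argument, generate_speech returns a spectrogram, not a waveform.
speech = model.generate_speech(inputs["input_ids"], speaker_embeddings, vocoder=vocoder)
sf.write("speech_output.flac", speech.numpy(), samplerate=16000)  # SpeechT5 outputs 16 kHz audio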
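The new max(entities_num / audio_length, 1) guard in merge_audio_video changes pacing as well as validity: once there are fewer entities than seconds of narration, the image track no longer stretches to cover the audio. A quick arithmetic check (illustrative numbers only, not from the commit):

entities_num, audio_length = 5, 20             # 5 images over a 20-second narration
old_fps = entities_num / audio_length          # 0.25 -> each image holds 4 s, video spans 20 s
new_fps = max(entities_num / audio_length, 1)  # clamped to 1 -> each image holds 1 s, video spans 5 s
print(old_fps, new_fps)                        # 0.25 1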