Spaces:
Sleeping
Sleeping
Better UI with progress bar and download button
Browse files
app.py
CHANGED
@@ -40,15 +40,6 @@ audio_file = st.file_uploader("🔼 Upload your audio file:", type=constants.SUP
|
|
40 |
|
41 |
print(audio_file,'is the upload')
|
42 |
|
43 |
-
# if audio_file is not None:
|
44 |
-
# # Check the duration of the uploaded audio file
|
45 |
-
# duration = get_audio_duration(audio_file)
|
46 |
-
|
47 |
-
# # Allow only files up to 5 minutes (300 seconds)
|
48 |
-
# if duration > 300:
|
49 |
-
# st.error("The uploaded audio file exceeds the 5-minute limit. Please upload a shorter file.")
|
50 |
-
# else:
|
51 |
-
# st.success(f"Audio file uploaded successfully! Duration: {duration/60:.2f} minutes")
|
52 |
|
53 |
if audio_file:
|
54 |
# Reset states only when a new file is uploaded
|
@@ -69,7 +60,7 @@ if audio_file:
|
|
69 |
result = client.audio.transcriptions.create(
|
70 |
file=(audio_file.name, file_bytes), # Send the audio file content directly to the API
|
71 |
model="whisper-large-v3-turbo", # Model to use for transcription
|
72 |
-
prompt="
|
73 |
response_format="verbose_json", # Return detailed JSON response
|
74 |
temperature=0.0, # Control randomness in the transcription output
|
75 |
)
|
@@ -115,35 +106,45 @@ if audio_file:
|
|
115 |
|
116 |
# Generate images only if they have not been generated already
|
117 |
if st.session_state.image_prompts and not st.session_state.generated_images:
|
118 |
-
|
119 |
-
|
120 |
-
|
121 |
-
|
122 |
-
|
123 |
-
|
124 |
-
|
125 |
-
|
126 |
-
|
127 |
-
|
128 |
-
|
|
|
|
|
129 |
|
130 |
# Generate video when all images are generated
|
131 |
if st.session_state.generated_images and st.session_state.audio:
|
132 |
-
|
133 |
-
|
134 |
-
|
135 |
-
|
136 |
-
|
137 |
-
|
138 |
-
|
139 |
-
|
140 |
-
|
141 |
-
|
142 |
-
st.success("Video generated successfully!")
|
143 |
|
144 |
# Display the generated video
|
145 |
if st.session_state.generated_video:
|
146 |
st.video(st.session_state.generated_video)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
147 |
|
148 |
else:
|
149 |
st.warning("Please upload an audio file to proceed.")
|
|
|
40 |
|
41 |
print(audio_file,'is the upload')
|
42 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
43 |
|
44 |
if audio_file:
|
45 |
# Reset states only when a new file is uploaded
|
|
|
60 |
result = client.audio.transcriptions.create(
|
61 |
file=(audio_file.name, file_bytes), # Send the audio file content directly to the API
|
62 |
model="whisper-large-v3-turbo", # Model to use for transcription
|
63 |
+
prompt="Take Note of Overall Context of the Audio", # Optional context for better transcription accuracy
|
64 |
response_format="verbose_json", # Return detailed JSON response
|
65 |
temperature=0.0, # Control randomness in the transcription output
|
66 |
)
|
|
|
106 |
|
107 |
# Generate images only if they have not been generated already
|
108 |
if st.session_state.image_prompts and not st.session_state.generated_images:
|
109 |
+
progress_placeholder = st.empty()
|
110 |
+
progress_bar = st.progress(0)
|
111 |
+
total_images = len(st.session_state.image_prompts)
|
112 |
+
progress_placeholder.text(f"Generating images. Please be patient...")
|
113 |
+
|
114 |
+
for idx, (prompt, image_path) in enumerate(generate_images(st.session_state.image_prompts)):
|
115 |
+
st.session_state.generated_images.append((prompt, image_path))
|
116 |
+
progress = (idx + 1) / total_images
|
117 |
+
progress_bar.progress(progress)
|
118 |
+
progress_placeholder.text(f"Generated image {idx + 1} of {total_images}: {prompt[:50]}...")
|
119 |
+
|
120 |
+
progress_placeholder.text("✅ All images generated successfully!")
|
121 |
+
progress_bar.empty()
|
122 |
|
123 |
# Generate video when all images are generated
|
124 |
if st.session_state.generated_images and st.session_state.audio:
|
125 |
+
with st.spinner("Generating video... Please wait."):
|
126 |
+
# Collect the image paths from the (prompt, path) pairs; generate_video pairs them with segments
|
127 |
+
image_paths = [img[1] for img in st.session_state.generated_images]
|
128 |
+
generated_video_path = generate_video(
|
129 |
+
audio_file=st.session_state.audio,
|
130 |
+
images=image_paths,
|
131 |
+
segments=st.session_state.segments
|
132 |
+
)
|
133 |
+
st.session_state.generated_video = generated_video_path
|
134 |
+
st.success("Video generated successfully!")
|
|
|
135 |
|
136 |
# Display the generated video
|
137 |
if st.session_state.generated_video:
|
138 |
st.video(st.session_state.generated_video)
|
139 |
+
|
140 |
+
# Add a download button for the generated video
|
141 |
+
with open(st.session_state.generated_video, "rb") as file:
|
142 |
+
st.download_button(
|
143 |
+
label="Download Video",
|
144 |
+
data=file,
|
145 |
+
file_name="generated_video.mp4",
|
146 |
+
mime="video/mp4"
|
147 |
+
)
|
148 |
|
149 |
else:
|
150 |
st.warning("Please upload an audio file to proceed.")
|
utils.py
CHANGED
@@ -1,4 +1,3 @@
|
|
1 |
-
|
2 |
import requests
|
3 |
import constants
|
4 |
import os
|
@@ -54,33 +53,7 @@ def get_translation(text: str):
|
|
54 |
print(f"An exception occurred: {e}")
|
55 |
return {"error_occured" : e}
|
56 |
|
57 |
-
|
58 |
-
|
59 |
-
def old_get_image_prompts(text_input):
|
60 |
-
headers = {
|
61 |
-
"Authorization": f"Bearer {constants.HF_TOKEN}", # Replace with your token
|
62 |
-
"Content-Type": "application/json" # Optional, ensures JSON payload
|
63 |
-
}
|
64 |
-
|
65 |
-
endpoint = f"{constants.PROMPT_GENERATION_ENDPOINT}"
|
66 |
-
payload = {"text_input": text_input}
|
67 |
-
|
68 |
-
try:
|
69 |
-
# Send the POST request
|
70 |
-
print("making post request for image prompts", endpoint)
|
71 |
-
response = requests.post(endpoint, json=payload, headers=headers)
|
72 |
-
|
73 |
-
# Raise an exception for HTTP errors
|
74 |
-
response.raise_for_status()
|
75 |
-
|
76 |
-
# Parse JSON response
|
77 |
-
result = response.json()
|
78 |
-
return result
|
79 |
-
|
80 |
-
except requests.exceptions.RequestException as e:
|
81 |
-
print(f"Error during request: {e}")
|
82 |
-
return {"error": str(e)}
|
83 |
-
|
84 |
def segments_to_chunks(segments):
|
85 |
chunks = []
|
86 |
for segment in segments:
|
@@ -98,7 +71,7 @@ def get_image_prompts(text_input : List):
|
|
98 |
extractor = StructuredOutputExtractor(response_schema=ImagePromptResponseSchema)
|
99 |
chunks_count = len(text_input)
|
100 |
chunks = "chunk: " + "\nchunk: ".join(text_input)
|
101 |
-
prompt = f"""ROLE: You are a Highly Experienced Image Prompt Sythesizer
|
102 |
TASK: Generate {chunks_count} image prompts, Each per chunk\n\n {chunks}"""
|
103 |
result = extractor.extract(prompt)
|
104 |
return result.model_dump() # returns dictionary version pydantic model
|
@@ -158,62 +131,15 @@ def tmp_folder(folder_name: str) -> str:
|
|
158 |
|
159 |
|
160 |
|
161 |
-
|
162 |
-
print(f"images: {images}")
|
163 |
-
print(f"segments: {segments}")
|
164 |
-
print(f"audio file: {audio_file.name}")
|
165 |
-
try:
|
166 |
-
# Save the uploaded audio file to a temporary location
|
167 |
-
file_extension = os.path.splitext(audio_file.name)[1]
|
168 |
-
temp_audio_path = tempfile.NamedTemporaryFile(delete=False, suffix=f"{file_extension}")
|
169 |
-
temp_audio_path.write(audio_file.read())
|
170 |
-
temp_audio_path.close()
|
171 |
-
|
172 |
-
# Load the audio file using MoviePy
|
173 |
-
audio = mp.AudioFileClip(temp_audio_path.name)
|
174 |
-
audio_duration = audio.duration
|
175 |
-
|
176 |
-
# Create video clips for each segment using the corresponding image
|
177 |
-
video_clips = []
|
178 |
-
for i, segment in enumerate(segments):
|
179 |
-
start_time = segment["start"]
|
180 |
-
end_time = segment["end"]
|
181 |
-
|
182 |
-
# Ensure the image index is within bounds
|
183 |
-
image_path = images[min(i, len(images) - 1)]
|
184 |
-
|
185 |
-
# Create an ImageClip for the current segment
|
186 |
-
image_clip = ImageClip(image_path, duration=end_time - start_time)
|
187 |
-
image_clip = image_clip.set_start(start_time).set_end(end_time)
|
188 |
-
video_clips.append(image_clip)
|
189 |
-
|
190 |
-
# Concatenate all the image clips to form the video
|
191 |
-
video = mp.concatenate_videoclips(video_clips, method="compose")
|
192 |
-
|
193 |
-
# Add the audio to the video
|
194 |
-
video = video.set_audio(audio)
|
195 |
-
|
196 |
-
# Save the video to a temporary file
|
197 |
-
temp_dir = tempfile.gettempdir()
|
198 |
-
video_path = os.path.join(temp_dir, "generated_video.mp4")
|
199 |
-
video.write_videofile(video_path, fps=24, codec="libx264", audio_codec="aac")
|
200 |
-
|
201 |
-
# Clean up the temporary audio file
|
202 |
-
os.remove(temp_audio_path.name)
|
203 |
|
204 |
-
return video_path
|
205 |
|
206 |
-
|
207 |
-
|
208 |
-
|
209 |
-
|
210 |
|
211 |
-
from moviepy.editor import *
|
212 |
|
213 |
def generate_video(audio_file, images, segments):
|
214 |
-
print(f"images: {images}")
|
215 |
-
print(f"segments: {segments}")
|
216 |
-
print(f"audio file: {audio_file.name}")
|
217 |
try:
|
218 |
# Save the uploaded audio file to a temporary location
|
219 |
file_extension = os.path.splitext(audio_file.name)[1]
|
@@ -223,36 +149,58 @@ def generate_video(audio_file, images, segments):
|
|
223 |
|
224 |
# Load the audio file using MoviePy
|
225 |
audio = AudioFileClip(temp_audio_path.name)
|
226 |
-
audio_duration = audio.duration
|
227 |
|
228 |
-
# Define YouTube-like dimensions (16:9 aspect ratio
|
229 |
-
frame_width =
|
230 |
-
frame_height =
|
231 |
|
232 |
-
# Create video clips for each segment using the corresponding image
|
233 |
video_clips = []
|
234 |
-
|
235 |
-
|
236 |
-
|
237 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
238 |
# Ensure the image index is within bounds
|
239 |
image_path = images[min(i, len(images) - 1)]
|
240 |
-
|
241 |
# Create an ImageClip for the current segment
|
242 |
-
image_clip = ImageClip(image_path
|
243 |
-
|
244 |
# Resize and pad the image to fit a 16:9 aspect ratio
|
245 |
image_clip = image_clip.resize(height=frame_height).on_color(
|
246 |
size=(frame_width, frame_height),
|
247 |
color=(0, 0, 0), # Black background
|
248 |
pos="center" # Center the image
|
249 |
)
|
250 |
-
|
251 |
-
# Set the
|
252 |
-
image_clip = image_clip.
|
|
|
|
|
253 |
video_clips.append(image_clip)
|
254 |
|
255 |
# Concatenate all the image clips to form the video
|
|
|
256 |
video = concatenate_videoclips(video_clips, method="compose")
|
257 |
|
258 |
# Add the audio to the video
|
@@ -261,16 +209,22 @@ def generate_video(audio_file, images, segments):
|
|
261 |
# Save the video to a temporary file
|
262 |
temp_dir = tempfile.gettempdir()
|
263 |
video_path = os.path.join(temp_dir, "generated_video.mp4")
|
264 |
-
|
|
|
265 |
|
266 |
# Clean up the temporary audio file
|
267 |
os.remove(temp_audio_path.name)
|
|
|
268 |
|
269 |
return video_path
|
270 |
|
271 |
except Exception as e:
|
272 |
print(f"Error generating video: {e}")
|
273 |
-
return
|
|
|
|
|
|
|
|
|
274 |
|
275 |
|
276 |
# Example usage:
|
|
|
|
|
1 |
import requests
|
2 |
import constants
|
3 |
import os
|
|
|
53 |
print(f"An exception occurred: {e}")
|
54 |
return {"error_occured" : e}
|
55 |
|
56 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
57 |
def segments_to_chunks(segments):
|
58 |
chunks = []
|
59 |
for segment in segments:
|
|
|
71 |
extractor = StructuredOutputExtractor(response_schema=ImagePromptResponseSchema)
|
72 |
chunks_count = len(text_input)
|
73 |
chunks = "chunk: " + "\nchunk: ".join(text_input)
|
74 |
+
prompt = f"""ROLE: You are a Highly Experienced Image Prompt Sythesizer (try to avoid explicit unethical prompt gracefully as much as possible)
|
75 |
TASK: Generate {chunks_count} image prompts, Each per chunk\n\n {chunks}"""
|
76 |
result = extractor.extract(prompt)
|
77 |
return result.model_dump() # returns dictionary version pydantic model
|
|
|
131 |
|
132 |
|
133 |
|
134 |
+
from moviepy.editor import *
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
135 |
|
|
|
136 |
|
137 |
+
import os
|
138 |
+
import tempfile
|
139 |
+
from moviepy.editor import AudioFileClip, ImageClip, concatenate_videoclips
|
|
|
140 |
|
|
|
141 |
|
142 |
def generate_video(audio_file, images, segments):
|
|
|
|
|
|
|
143 |
try:
|
144 |
# Save the uploaded audio file to a temporary location
|
145 |
file_extension = os.path.splitext(audio_file.name)[1]
|
|
|
149 |
|
150 |
# Load the audio file using MoviePy
|
151 |
audio = AudioFileClip(temp_audio_path.name)
|
|
|
152 |
|
153 |
+
# Define YouTube-like dimensions (16:9 aspect ratio)
|
154 |
+
frame_width = 1280
|
155 |
+
frame_height = 720
|
156 |
|
|
|
157 |
video_clips = []
|
158 |
+
total_segments = len(segments)
|
159 |
+
|
160 |
+
for i, current_segment in enumerate(segments):
|
161 |
+
start_time = current_segment["start"]
|
162 |
+
end_time = current_segment["end"]
|
163 |
+
|
164 |
+
# Determine the actual end time, extending over any gap until the next segment starts
|
165 |
+
if i < total_segments - 1:
|
166 |
+
# If there's a next segment, extend until it starts
|
167 |
+
next_segment = segments[i + 1]
|
168 |
+
actual_end_time = next_segment["start"]
|
169 |
+
else:
|
170 |
+
# For the last segment, use its end time
|
171 |
+
actual_end_time = end_time
|
172 |
+
|
173 |
+
# Calculate total duration including any gap
|
174 |
+
segment_duration = actual_end_time - start_time
|
175 |
+
|
176 |
+
print(f"\nProcessing segment {i + 1}/{total_segments}:")
|
177 |
+
print(f" Start time: {start_time}s")
|
178 |
+
print(f" Base end time: {end_time}s")
|
179 |
+
print(f" Actual end time: {actual_end_time}s")
|
180 |
+
print(f" Total duration: {segment_duration}s")
|
181 |
+
print(f" Text: '{current_segment['text']}'")
|
182 |
+
|
183 |
# Ensure the image index is within bounds
|
184 |
image_path = images[min(i, len(images) - 1)]
|
185 |
+
|
186 |
# Create an ImageClip for the current segment
|
187 |
+
image_clip = ImageClip(image_path)
|
188 |
+
|
189 |
# Resize and pad the image to fit a 16:9 aspect ratio
|
190 |
image_clip = image_clip.resize(height=frame_height).on_color(
|
191 |
size=(frame_width, frame_height),
|
192 |
color=(0, 0, 0), # Black background
|
193 |
pos="center" # Center the image
|
194 |
)
|
195 |
+
|
196 |
+
# Set the duration and start time for the clip
|
197 |
+
image_clip = image_clip.set_duration(segment_duration)
|
198 |
+
image_clip = image_clip.set_start(start_time) # Set the start time explicitly
|
199 |
+
|
200 |
video_clips.append(image_clip)
|
201 |
|
202 |
# Concatenate all the image clips to form the video
|
203 |
+
print("Concatenating video clips...")
|
204 |
video = concatenate_videoclips(video_clips, method="compose")
|
205 |
|
206 |
# Add the audio to the video
|
|
|
209 |
# Save the video to a temporary file
|
210 |
temp_dir = tempfile.gettempdir()
|
211 |
video_path = os.path.join(temp_dir, "generated_video.mp4")
|
212 |
+
print(f"Writing video file to {video_path}...")
|
213 |
+
video.write_videofile(video_path, fps=30, codec="libx264", audio_codec="aac")
|
214 |
|
215 |
# Clean up the temporary audio file
|
216 |
os.remove(temp_audio_path.name)
|
217 |
+
print("Temporary audio file removed.")
|
218 |
|
219 |
return video_path
|
220 |
|
221 |
except Exception as e:
|
222 |
print(f"Error generating video: {e}")
|
223 |
+
return None
|
224 |
+
|
225 |
+
|
226 |
+
|
227 |
+
|
228 |
|
229 |
|
230 |
# Example usage:
|