Files changed:
- app.py +21 -7
- baseline_utils.py +52 -9
app.py
CHANGED

@@ -13,7 +13,8 @@ import os
 openai_api_key = os.getenv("OPENAI_API_KEY")
 google_service_account_info = json.loads(os.getenv("GOOGLE_SERVICE_ACCOUNT"))
 gemini_api_key = os.getenv("GEMINI_API_KEY")
-
+eleven_api_key = os.getenv("ELEVEN_API_KEY")
+eleven_api_key = "sk_992a2f46b6cd194bb8613c93063bfba646ed20a555d8528e"
 # Initialize OpenAI
 openai.api_key = openai_api_key
 
@@ -23,7 +24,7 @@ def get_google_credentials():
     return service_account.Credentials.from_service_account_info(google_service_account_info)
 
 
-def process_images(diary_image, writer_image):
+def process_images(diary_image, writer_image, audio_option):
     # Save the file-like objects as image files
     diary_image_path = "temp_upload_images/temp_diary_image.png"
     writer_image_path = "temp_upload_images/temp_writer_image.png"
@@ -42,8 +43,13 @@ def process_images(diary_image, writer_image):
     scene_list = [scene.strip() for scene in scenes.split("Scene")[1:]]
     scene_list = [scene.split(": ", 1)[1] for scene in scene_list]
 
+    # Generate the summaries for audio narration
+    audio_summaries = summarizer_for_audio(detected_text)
+    # Generate the narration under main file
+    narration_generate(audio_summaries, eleven_api_key)
+
     # Generate the video based on the summaries
-    video_path = generate_video(scene_list, writer_summary, fps=24)
+    video_path = generate_video(scene_list, writer_summary, audio_option, fps=24)
 
     caption = scenes_caption(scene_list, openai_api_key)
 
@@ -51,9 +57,9 @@ def process_images(diary_image, writer_image):
 
 
 # Define the Gradio interface
-def gradio_interface(diary_image, writer_image):
+def gradio_interface(diary_image, writer_image, audio_option):
     # Process the images and generate the video
-    video_paths, prompts = process_images(diary_image, writer_image)
+    video_paths, prompts = process_images(diary_image, writer_image, audio_option)
 
     # Return the paths and corresponding prompts
     return video_paths, prompts
@@ -68,6 +74,14 @@ with gr.Blocks() as interface:
         with gr.Column():
             diary_image_input = gr.Image(label="Upload your handwritten diary image", type="pil")
             writer_image_input = gr.Image(label="Upload a photo of the writer", type="pil")
+
+            # Add a radio button for selecting audio options
+            audio_option = gr.Radio(
+                ["Narration", "Meow"],
+                label="Choose Audio Option",
+                value="Narration"  # Default selection
+            )
+
             submit_button = gr.Button("Generate Video")
 
     # Right column for generated video and caption
@@ -78,9 +92,9 @@ with gr.Blocks() as interface:
     # Bind the submit button click to trigger the video generation and display
     submit_button.click(
         fn=gradio_interface,
-        inputs=[diary_image_input, writer_image_input],
+        inputs=[diary_image_input, writer_image_input, audio_option],
         outputs=[video_output, caption_output]
    )
 
 # Launch the interface
-interface.launch()
+interface.launch(debug=True)
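To see what the new Radio component hands to the click handler, here is a small self-contained sketch; the component names mirror the diff, while the handler and Textbox are illustrative only and not part of app.py:

import gradio as gr

# The Radio's selected string ("Narration" or "Meow") arrives as the third
# positional argument of the click handler, the same way app.py now passes
# audio_option into process_images.
def handler(diary_image, writer_image, audio_option):
    return f"audio option received: {audio_option}"

with gr.Blocks() as demo:
    diary = gr.Image(label="Diary", type="pil")
    writer = gr.Image(label="Writer", type="pil")
    audio_option = gr.Radio(["Narration", "Meow"], value="Narration", label="Choose Audio Option")
    out = gr.Textbox(label="Result")
    gr.Button("Run").click(fn=handler, inputs=[diary, writer, audio_option], outputs=out)

demo.launch()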
baseline_utils.py
CHANGED

@@ -1,4 +1,5 @@
 import openai
+
 from google.cloud import vision
 import io
 import google.generativeai as genai
@@ -8,6 +9,9 @@ from diffusers.utils import export_to_video
 import os
 import spaces
 from moviepy.editor import VideoFileClip, AudioFileClip, concatenate_videoclips
+from transformers import pipeline
+import requests
+from transformers import pipeline
 
 # Utilize the Google Cloud Vision API to recognize text in the
 # input input_images (diary input_images), https://cloud.google.com/vision.
@@ -44,15 +48,15 @@ def analyze_writer_image(image_path, api_key):
     myfile = genai.upload_file(image_path)
     color = model.generate_content([myfile,"What is the predominant color of the person in the image?"])
     description = f"""
-    The main character is a cartoonish, fluffy cat with large, expressive blue eyes.
+    The main character is a cartoonish, fluffy cat with large, expressive blue eyes.
     Its fur is predominantly {color}, with subtle shading on certain parts of its body in a slightly darker or lighter shade of {color}.
     The face is round with soft, slightly pointed ears that are highlighted with an inner coloring also in {color}.
 
-    The most prominent feature of the cat is its extremely fluffy, oversized tail, which arcs gracefully above its body.
-    The tail fur is thick, feathery, and has a luxurious texture that stands out against the rest of the body, showcasing
-    a gradient effect from darker to lighter shades of {color} at the edges.
+    The most prominent feature of the cat is its extremely fluffy, oversized tail, which arcs gracefully above its body.
+    The tail fur is thick, feathery, and has a luxurious texture that stands out against the rest of the body, showcasing
+    a gradient effect from darker to lighter shades of {color} at the edges.
 
-    The cat’s paws are small and round, with shading in a slightly darker shade of {color}.
+    The cat’s paws are small and round, with shading in a slightly darker shade of {color}.
     The overall look of the figure is cute, gentle, and meticulously detailed, emphasizing a soft and playful appearance.
     """
     return description
@@ -120,7 +124,7 @@ def scenes_caption(scenes, api_key):
 
 
 @spaces.GPU
-def generate_video(scene_list, writer_description, fps=24): # Lower fps
+def generate_video(scene_list, writer_description, opt, fps=24): # Lower fps
 
     pipe = CogVideoXPipeline.from_pretrained(
         "THUDM/CogVideoX-5b",
@@ -160,12 +164,15 @@ def generate_video(scene_list, writer_description, fps=24): # Lower fps
         video_paths.append(video_path)
 
     # Concatenate the generated videos into a single video
-    concatenated_video_path = "videos/
-    concatenate_videos(video_paths, concatenated_video_path)
+    concatenated_video_path = "videos/combined_video_music.mp4"
+    if opt == "Narration":
+        concatenate_videos_music(video_paths, concatenated_video_path, audio_path="narration.mp3")
+    else:
+        concatenate_videos_music(video_paths, concatenated_video_path, audio_path="meow-meow-meow-tiktok.mp3")
     return concatenated_video_path
 
 
-def concatenate_videos(video_paths, output_path, audio_path="meow-meow-meow-tiktok.mp3"):
+def concatenate_videos_music(video_paths, output_path, audio_path):
     # Load each video file as a VideoFileClip
     clips = [VideoFileClip(video) for video in video_paths]
 
@@ -183,3 +190,39 @@ def concatenate_videos(video_paths, output_path, audio_path="meow-meow-meow-tiktok.mp3"):
 
     # Write the concatenated video to a file
     final_clip.write_videofile(output_path, codec="libx264", audio_codec="aac")
+
+def summarizer_for_audio(input_text):
+    summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
+
+    # Generate the summary
+    summary = summarizer(
+        input_text,
+        max_length=40,
+        min_length=30,
+        do_sample=False
+    )[0]["summary_text"]
+
+    return summary
+
+def narration_generate(input, api_key):
+    url = "https://api.elevenlabs.io/v1/text-to-speech/9BWtsMINqrJLrRacOk9x"
+    headers = {
+        "Accept": "audio/mpeg",
+        "Content-Type": "application/json",
+        "xi-api-key": api_key
+    }
+
+    data = {
+        "text": input,
+        "model_id": "eleven_monolingual_v1",
+        "voice_settings": {
+            "stability": 0.5,
+            "similarity_boost": 0.5
+        }
+    }
+
+    response = requests.post(url, json=data, headers=headers)
+    with open('narration.mp3', 'wb') as f:
+        for chunk in response.iter_content(chunk_size=1024):
+            if chunk:
+                f.write(chunk)