jjz5463 committed
Commit 379b5e6 · 1 Parent(s): 544723a
Files changed (2):
  1. app.py +21 -7
  2. baseline_utils.py +52 -9
app.py CHANGED
@@ -13,7 +13,8 @@ import os
 openai_api_key = os.getenv("OPENAI_API_KEY")
 google_service_account_info = json.loads(os.getenv("GOOGLE_SERVICE_ACCOUNT"))
 gemini_api_key = os.getenv("GEMINI_API_KEY")
-
+eleven_api_key = os.getenv("ELEVEN_API_KEY")
+eleven_api_key = "sk_992a2f46b6cd194bb8613c93063bfba646ed20a555d8528e"
 # Initialize OpenAI
 openai.api_key = openai_api_key
 
@@ -23,7 +24,7 @@ def get_google_credentials():
     return service_account.Credentials.from_service_account_info(google_service_account_info)
 
 
-def process_images(diary_image, writer_image):
+def process_images(diary_image, writer_image, audio_option):
     # Save the file-like objects as image files
     diary_image_path = "temp_upload_images/temp_diary_image.png"
     writer_image_path = "temp_upload_images/temp_writer_image.png"
@@ -42,8 +43,13 @@ def process_images(diary_image, writer_image):
     scene_list = [scene.strip() for scene in scenes.split("Scene")[1:]]
     scene_list = [scene.split(": ", 1)[1] for scene in scene_list]
 
+    # Generate the summaries for audio narration
+    audio_summaries = summarizer_for_audio(detected_text)
+    # Generate the narration under the main file
+    narration_generate(audio_summaries, eleven_api_key)
+
     # Generate the video based on the summaries
-    video_path = generate_video(scene_list, writer_summary, fps=24)
+    video_path = generate_video(scene_list, writer_summary, audio_option, fps=24)
 
     caption = scenes_caption(scene_list, openai_api_key)
 
@@ -51,9 +57,9 @@ def process_images(diary_image, writer_image):
 
 
 # Define the Gradio interface
-def gradio_interface(diary_image, writer_image):
+def gradio_interface(diary_image, writer_image, audio_option):
     # Process the images and generate the video
-    video_paths, prompts = process_images(diary_image, writer_image)
+    video_paths, prompts = process_images(diary_image, writer_image, audio_option)
 
     # Return the paths and corresponding prompts
     return video_paths, prompts
@@ -68,6 +74,14 @@ with gr.Blocks() as interface:
         with gr.Column():
             diary_image_input = gr.Image(label="Upload your handwritten diary image", type="pil")
             writer_image_input = gr.Image(label="Upload a photo of the writer", type="pil")
+
+            # Add a radio button for selecting audio options
+            audio_option = gr.Radio(
+                ["Narration", "Meow"],
+                label="Choose Audio Option",
+                value="Narration"  # Default selection
+            )
+
             submit_button = gr.Button("Generate Video")
 
     # Right column for generated video and caption
@@ -78,9 +92,9 @@ with gr.Blocks() as interface:
     # Bind the submit button click to trigger the video generation and display
     submit_button.click(
         fn=gradio_interface,
-        inputs=[diary_image_input, writer_image_input],
+        inputs=[diary_image_input, writer_image_input, audio_option],
         outputs=[video_output, caption_output]
     )
 
     # Launch the interface
-    interface.launch()
+    interface.launch(debug=True)
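The app.py change threads the selected gr.Radio value through gradio_interface into process_images as a plain string ("Narration" or "Meow"). As a minimal, self-contained sketch of that wiring pattern (the handler, button, and textbox below are hypothetical stand-ins, not code from this repo):

import gradio as gr

def handler(audio_option):
    # Gradio passes the Radio component's current choice as a string
    return f"Selected audio option: {audio_option}"

with gr.Blocks() as demo:
    audio_option = gr.Radio(
        ["Narration", "Meow"],
        label="Choose Audio Option",
        value="Narration"  # default selection
    )
    button = gr.Button("Go")
    output = gr.Textbox(label="Result")
    # Listing the Radio in `inputs` forwards its value positionally to the handler
    button.click(fn=handler, inputs=[audio_option], outputs=[output])

demo.launch()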
baseline_utils.py CHANGED
@@ -1,4 +1,5 @@
 import openai
+
 from google.cloud import vision
 import io
 import google.generativeai as genai
@@ -8,6 +9,9 @@ from diffusers.utils import export_to_video
 import os
 import spaces
 from moviepy.editor import VideoFileClip, AudioFileClip, concatenate_videoclips
+from transformers import pipeline
+import requests
+from transformers import pipeline
 
 # Utilize the Google Cloud Vision API to recognize text in the
 # input input_images (diary input_images), https://cloud.google.com/vision.
@@ -44,15 +48,15 @@ def analyze_writer_image(image_path, api_key):
     myfile = genai.upload_file(image_path)
     color = model.generate_content([myfile,"What is the predominant color of the person in the image?"])
     description = f"""
-    The main character is a cartoonish, fluffy cat with large, expressive blue eyes.
+    The main character is a cartoonish, fluffy cat with large, expressive blue eyes.
     Its fur is predominantly {color}, with subtle shading on certain parts of its body in a slightly darker or lighter shade of {color}.
     The face is round with soft, slightly pointed ears that are highlighted with an inner coloring also in {color}.
 
-    The most prominent feature of the cat is its extremely fluffy, oversized tail, which arcs gracefully above its body.
-    The tail fur is thick, feathery, and has a luxurious texture that stands out against the rest of the body, showcasing
-    a gradient effect from darker to lighter shades of {color} at the edges.
+    The most prominent feature of the cat is its extremely fluffy, oversized tail, which arcs gracefully above its body.
+    The tail fur is thick, feathery, and has a luxurious texture that stands out against the rest of the body, showcasing
+    a gradient effect from darker to lighter shades of {color} at the edges.
 
-    The cat’s paws are small and round, with shading in a slightly darker shade of {color}.
+    The cat’s paws are small and round, with shading in a slightly darker shade of {color}.
     The overall look of the figure is cute, gentle, and meticulously detailed, emphasizing a soft and playful appearance.
     """
     return description
@@ -120,7 +124,7 @@ def scenes_caption(scenes, api_key):
 
 
 @spaces.GPU
-def generate_video(scene_list, writer_description, fps=24):  # Lower fps
+def generate_video(scene_list, writer_description, opt, fps=24):  # Lower fps
 
     pipe = CogVideoXPipeline.from_pretrained(
         "THUDM/CogVideoX-5b",
@@ -160,12 +164,15 @@ def generate_video(scene_list, writer_description, fps=24):  # Lower fps
         video_paths.append(video_path)
 
     # Concatenate the generated videos into a single video
-    concatenated_video_path = "videos/combined_video.mp4"
-    concatenate_videos(video_paths, concatenated_video_path, audio_path="meow-meow-meow-tiktok.mp3")
+    concatenated_video_path = "videos/combined_video_music.mp4"
+    if opt == "Narration":
+        concatenate_videos_music(video_paths, concatenated_video_path, audio_path="narration.mp3")
+    else:
+        concatenate_videos_music(video_paths, concatenated_video_path, audio_path="meow-meow-meow-tiktok.mp3")
     return concatenated_video_path
 
 
-def concatenate_videos(video_paths, output_path, audio_path="meow-meow-meow-tiktok.mp3"):
+def concatenate_videos_music(video_paths, output_path, audio_path):
     # Load each video file as a VideoFileClip
     clips = [VideoFileClip(video) for video in video_paths]
 
@@ -183,3 +190,39 @@ def concatenate_videos(video_paths, output_path, audio_path="meow-meow-meow-tikt
 
     # Write the concatenated video to a file
     final_clip.write_videofile(output_path, codec="libx264", audio_codec="aac")
+
+def summarizer_for_audio(input_text):
+    summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
+
+    # Generate the summary
+    summary = summarizer(
+        input_text,
+        max_length=40,
+        min_length=30,
+        do_sample=False
+    )[0]["summary_text"]
+
+    return summary
+
+def narration_generate(input, api_key):
+    url = "https://api.elevenlabs.io/v1/text-to-speech/9BWtsMINqrJLrRacOk9x"
+    headers = {
+        "Accept": "audio/mpeg",
+        "Content-Type": "application/json",
+        "xi-api-key": api_key
+    }
+
+    data = {
+        "text": input,
+        "model_id": "eleven_monolingual_v1",
+        "voice_settings": {
+            "stability": 0.5,
+            "similarity_boost": 0.5
+        }
+    }
+
+    response = requests.post(url, json=data, headers=headers)
+    with open('narration.mp3', 'wb') as f:
+        for chunk in response.iter_content(chunk_size=1024):
+            if chunk:
+                f.write(chunk)
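Taken together, the two helpers added above turn the OCR'd diary text into spoken narration: summarizer_for_audio condenses the text with the facebook/bart-large-cnn pipeline, and narration_generate posts the summary to the ElevenLabs text-to-speech endpoint and writes the response to narration.mp3, which generate_video muxes in when opt == "Narration". A minimal usage sketch, assuming baseline_utils is importable, ELEVEN_API_KEY is set in the environment, and with a hypothetical diary text standing in for the Vision API output:

import os
from baseline_utils import summarizer_for_audio, narration_generate

# Hypothetical OCR output; in the app this comes from the Vision API
detected_text = (
    "Today I walked to the park, fed the ducks by the pond, and sketched "
    "the old oak tree until the sun went down behind the hills."
)

# Condense the diary text into a short narration script
audio_summary = summarizer_for_audio(detected_text)

# Synthesize speech; this writes narration.mp3 into the working directory,
# where generate_video(..., opt="Narration") expects to find it
narration_generate(audio_summary, os.getenv("ELEVEN_API_KEY"))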