jjz5463 committed
Commit 379b5e6 · 1 Parent(s): 544723a
Files changed (2):
  1. app.py +21 -7
  2. baseline_utils.py +52 -9
app.py CHANGED
@@ -13,7 +13,8 @@ import os
 openai_api_key = os.getenv("OPENAI_API_KEY")
 google_service_account_info = json.loads(os.getenv("GOOGLE_SERVICE_ACCOUNT"))
 gemini_api_key = os.getenv("GEMINI_API_KEY")
-
+eleven_api_key = os.getenv("ELEVEN_API_KEY")
+eleven_api_key = "sk_992a2f46b6cd194bb8613c93063bfba646ed20a555d8528e"
 # Initialize OpenAI
 openai.api_key = openai_api_key
 
@@ -23,7 +24,7 @@ def get_google_credentials():
     return service_account.Credentials.from_service_account_info(google_service_account_info)
 
 
-def process_images(diary_image, writer_image):
+def process_images(diary_image, writer_image, audio_option):
     # Save the file-like objects as image files
     diary_image_path = "temp_upload_images/temp_diary_image.png"
     writer_image_path = "temp_upload_images/temp_writer_image.png"
@@ -42,8 +43,13 @@ def process_images(diary_image, writer_image):
     scene_list = [scene.strip() for scene in scenes.split("Scene")[1:]]
     scene_list = [scene.split(": ", 1)[1] for scene in scene_list]
 
+    # Generate the summaries for audio narration
+    audio_summaries = summarizer_for_audio(detected_text)
+    # Generate the narration under the main file
+    narration_generate(audio_summaries, eleven_api_key)
+
     # Generate the video based on the summaries
-    video_path = generate_video(scene_list, writer_summary, fps=24)
+    video_path = generate_video(scene_list, writer_summary, audio_option, fps=24)
 
     caption = scenes_caption(scene_list, openai_api_key)
 
@@ -51,9 +57,9 @@ def process_images(diary_image, writer_image):
 
 
 # Define the Gradio interface
-def gradio_interface(diary_image, writer_image):
+def gradio_interface(diary_image, writer_image, audio_option):
     # Process the images and generate the video
-    video_paths, prompts = process_images(diary_image, writer_image)
+    video_paths, prompts = process_images(diary_image, writer_image, audio_option)
 
     # Return the paths and corresponding prompts
     return video_paths, prompts
@@ -68,6 +74,14 @@ with gr.Blocks() as interface:
         with gr.Column():
             diary_image_input = gr.Image(label="Upload your handwritten diary image", type="pil")
             writer_image_input = gr.Image(label="Upload a photo of the writer", type="pil")
+
+            # Add a radio button for selecting audio options
+            audio_option = gr.Radio(
+                ["Narration", "Meow"],
+                label="Choose Audio Option",
+                value="Narration"  # Default selection
+            )
+
             submit_button = gr.Button("Generate Video")
 
     # Right column for generated video and caption
@@ -78,9 +92,9 @@ with gr.Blocks() as interface:
     # Bind the submit button click to trigger the video generation and display
     submit_button.click(
         fn=gradio_interface,
-        inputs=[diary_image_input, writer_image_input],
+        inputs=[diary_image_input, writer_image_input, audio_option],
         outputs=[video_output, caption_output]
     )
 
     # Launch the interface
-    interface.launch()
+    interface.launch(debug=True)
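The app.py change threads the selected gr.Radio value through gradio_interface into process_images as a plain string ("Narration" or "Meow"). As a minimal, self-contained sketch of that wiring pattern (the handler, button, and textbox below are hypothetical stand-ins, not code from this repo):

import gradio as gr

def handler(audio_option):
    # Gradio passes the Radio component's current choice as a string
    return f"Selected audio option: {audio_option}"

with gr.Blocks() as demo:
    audio_option = gr.Radio(
        ["Narration", "Meow"],
        label="Choose Audio Option",
        value="Narration"  # default selection
    )
    button = gr.Button("Go")
    output = gr.Textbox(label="Result")
    # Listing the Radio in `inputs` forwards its value positionally to the handler
    button.click(fn=handler, inputs=[audio_option], outputs=[output])

demo.launch()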
baseline_utils.py CHANGED
@@ -1,4 +1,5 @@
 import openai
+
 from google.cloud import vision
 import io
 import google.generativeai as genai
@@ -8,6 +9,9 @@ from diffusers.utils import export_to_video
 import os
 import spaces
 from moviepy.editor import VideoFileClip, AudioFileClip, concatenate_videoclips
+from transformers import pipeline
+import requests
+from transformers import pipeline
 
 # Utilize the Google Cloud Vision API to recognize text in the
 # input input_images (diary input_images), https://cloud.google.com/vision.
@@ -44,15 +48,15 @@ def analyze_writer_image(image_path, api_key):
     myfile = genai.upload_file(image_path)
     color = model.generate_content([myfile,"What is the predominant color of the person in the image?"])
     description = f"""
-    The main character is a cartoonish, fluffy cat with large, expressive blue eyes.
+    The main character is a cartoonish, fluffy cat with large, expressive blue eyes.
     Its fur is predominantly {color}, with subtle shading on certain parts of its body in a slightly darker or lighter shade of {color}.
     The face is round with soft, slightly pointed ears that are highlighted with an inner coloring also in {color}.
 
-    The most prominent feature of the cat is its extremely fluffy, oversized tail, which arcs gracefully above its body.
-    The tail fur is thick, feathery, and has a luxurious texture that stands out against the rest of the body, showcasing
-    a gradient effect from darker to lighter shades of {color} at the edges.
+    The most prominent feature of the cat is its extremely fluffy, oversized tail, which arcs gracefully above its body.
+    The tail fur is thick, feathery, and has a luxurious texture that stands out against the rest of the body, showcasing
+    a gradient effect from darker to lighter shades of {color} at the edges.
 
-    The cat’s paws are small and round, with shading in a slightly darker shade of {color}.
+    The cat’s paws are small and round, with shading in a slightly darker shade of {color}.
     The overall look of the figure is cute, gentle, and meticulously detailed, emphasizing a soft and playful appearance.
     """
     return description
@@ -120,7 +124,7 @@ def scenes_caption(scenes, api_key):
 
 
 @spaces.GPU
-def generate_video(scene_list, writer_description, fps=24):  # Lower fps
+def generate_video(scene_list, writer_description, opt, fps=24):  # Lower fps
 
     pipe = CogVideoXPipeline.from_pretrained(
         "THUDM/CogVideoX-5b",
@@ -160,12 +164,15 @@ def generate_video(scene_list, writer_description, fps=24):  # Lower fps
         video_paths.append(video_path)
 
     # Concatenate the generated videos into a single video
-    concatenated_video_path = "videos/combined_video.mp4"
-    concatenate_videos(video_paths, concatenated_video_path, audio_path="meow-meow-meow-tiktok.mp3")
+    concatenated_video_path = "videos/combined_video_music.mp4"
+    if opt == "Narration":
+        concatenate_videos_music(video_paths, concatenated_video_path, audio_path="narration.mp3")
+    else:
+        concatenate_videos_music(video_paths, concatenated_video_path, audio_path="meow-meow-meow-tiktok.mp3")
     return concatenated_video_path
 
 
-def concatenate_videos(video_paths, output_path, audio_path="meow-meow-meow-tiktok.mp3"):
+def concatenate_videos_music(video_paths, output_path, audio_path):
     # Load each video file as a VideoFileClip
     clips = [VideoFileClip(video) for video in video_paths]
 
@@ -183,3 +190,39 @@ def concatenate_videos(video_paths, output_path, audio_path="meow-meow-meow-tikt
 
     # Write the concatenated video to a file
     final_clip.write_videofile(output_path, codec="libx264", audio_codec="aac")
+
+def summarizer_for_audio(input_text):
+    summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
+
+    # Generate the summary
+    summary = summarizer(
+        input_text,
+        max_length=40,
+        min_length=30,
+        do_sample=False
+    )[0]["summary_text"]
+
+    return summary
+
+def narration_generate(input, api_key):
+    url = "https://api.elevenlabs.io/v1/text-to-speech/9BWtsMINqrJLrRacOk9x"
+    headers = {
+        "Accept": "audio/mpeg",
+        "Content-Type": "application/json",
+        "xi-api-key": api_key
+    }
+
+    data = {
+        "text": input,
+        "model_id": "eleven_monolingual_v1",
+        "voice_settings": {
+            "stability": 0.5,
+            "similarity_boost": 0.5
+        }
+    }
+
+    response = requests.post(url, json=data, headers=headers)
+    with open('narration.mp3', 'wb') as f:
+        for chunk in response.iter_content(chunk_size=1024):
+            if chunk:
+                f.write(chunk)
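Taken together, the two helpers added above turn the OCR'd diary text into spoken narration: summarizer_for_audio condenses the text with the facebook/bart-large-cnn pipeline, and narration_generate posts the summary to the ElevenLabs text-to-speech endpoint and writes the response to narration.mp3, which generate_video muxes in when opt == "Narration". A minimal usage sketch, assuming baseline_utils is importable, ELEVEN_API_KEY is set in the environment, and with a hypothetical diary text standing in for the Vision API output:

import os
from baseline_utils import summarizer_for_audio, narration_generate

# Hypothetical OCR output; in the app this comes from the Vision API
detected_text = (
    "Today I walked to the park, fed the ducks by the pond, and sketched "
    "the old oak tree until the sun went down behind the hills."
)

# Condense the diary text into a short narration script
audio_summary = summarizer_for_audio(detected_text)

# Synthesize speech; this writes narration.mp3 into the working directory,
# where generate_video(..., opt="Narration") expects to find it
narration_generate(audio_summary, os.getenv("ELEVEN_API_KEY"))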