jjz5463 committed on
Commit
5de6ba9
·
1 Parent(s): 75c0fdd

super model

Browse files
Files changed (2) hide show
  1. app.py +7 -5
  2. baseline_utils.py +63 -47
app.py CHANGED
@@ -5,7 +5,7 @@ from google.oauth2 import service_account
5
  from baseline_utils import (detect_text_in_image,
6
  analyze_writer_image,
7
  generate_video,
8
- break_summary_to_activities)
9
  import os
10
 
11
  # Load secrets from Hugging Face Spaces environment
@@ -33,16 +33,18 @@ def process_images(diary_image, writer_image):
33
  # Detect text from the diary image
34
  google_credentials = get_google_credentials()
35
  detected_text = detect_text_in_image(diary_image_path, google_credentials)
36
- activities = break_summary_to_activities(detected_text, openai_api_key)
37
- activity_list = activities.strip('[]').split(', ')
38
 
39
  # Analyze the writer's image using Gemini API
40
  writer_summary = analyze_writer_image(writer_image_path, gemini_api_key)
41
 
 
 
 
 
42
  # Generate the video based on the summaries
43
- video_paths = generate_video(activity_list, writer_summary, fps=24)
44
 
45
- return video_paths, activity_list
46
 
47
 
48
  # Define the Gradio interface
 
5
  from baseline_utils import (detect_text_in_image,
6
  analyze_writer_image,
7
  generate_video,
8
+ break_diary_to_scenes)
9
  import os
10
 
11
  # Load secrets from Hugging Face Spaces environment
 
33
  # Detect text from the diary image
34
  google_credentials = get_google_credentials()
35
  detected_text = detect_text_in_image(diary_image_path, google_credentials)
 
 
36
 
37
  # Analyze the writer's image using Gemini API
38
  writer_summary = analyze_writer_image(writer_image_path, gemini_api_key)
39
 
40
+ scenes = break_diary_to_scenes(detected_text, writer_summary, openai_api_key)
41
+ scene_list = [scene.strip() for scene in scenes.split("Scene")[1:]]
42
+ scene_list = [scene.split(": ", 1)[1] for scene in scene_list]
43
+
44
  # Generate the video based on the summaries
45
+ video_paths = generate_video(scene_list, fps=24)
46
 
47
+ return video_paths, scene_list
48
 
49
 
50
  # Define the Gradio interface
baseline_utils.py CHANGED
@@ -2,10 +2,9 @@ import openai
2
  from google.cloud import vision
3
  import io
4
  import google.generativeai as genai
5
- from diffusers import DiffusionPipeline
6
  import torch
7
  from diffusers.utils import export_to_video
8
- import numpy as np
9
  import os
10
  import spaces
11
 
@@ -35,24 +34,50 @@ def detect_text_in_image(image_path, credentials):
35
  return texts[0].description if texts else ''
36
 
37
 
38
- def break_summary_to_activities(text, api_key):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
39
  # Initialize the OpenAI client
40
  client = openai.Client(api_key=api_key)
41
 
 
 
 
 
 
 
 
42
  # Use the client to call the chat completion API
43
  response = client.chat.completions.create(
44
  model="gpt-4", # Use GPT-4
45
  messages=[
46
- {"role": "user",
47
- "content": f"Please break the following diary into exactly four most important activities. "
48
- f"Each activity must be formatted as 'I am [activity]' and must describe only one specific action. "
49
- f"Make sure each activity is distinct and only contains a single action (e.g., no combinations like 'eating and teaching'). "
50
- f"Additionally, each activity should be no more than six words: {text}. "
51
- f"Return the four activities as a list in the following format: "
52
- f"[activity1, activity2, activity3, activity4], without any quotation marks, extra text, or explanations."}
 
53
  ],
54
- max_tokens=150,
55
- temperature=0.7,
56
  n=1 # Number of completions to generate
57
  )
58
 
@@ -60,32 +85,25 @@ def break_summary_to_activities(text, api_key):
60
  return response.choices[0].message.content
61
 
62
 
63
# Describe the diary writer's photo with Google's Gemini API
# (https://ai.google.dev/gemini-api/docs/models/gemini): upload the
# image, ask for a terse description, and hand back the model's text.
def analyze_writer_image(image_path, api_key):
    """Return a very short Gemini-generated description of the writer photo.

    Args:
        image_path: Path on disk to the writer's image.
        api_key: Google Generative AI API key.

    Returns:
        The model's textual description of the people in the picture.
    """
    genai.configure(api_key=api_key)
    gemini = genai.GenerativeModel("gemini-1.5-flash")
    # Upload the local file so it can be referenced inside the prompt list.
    uploaded = genai.upload_file(image_path)
    instruction = ("Provide a description of the people in the picture, "
                   "focusing on their characteristics. Keep it under two words "
                   "and ensure the description does not contain any line breaks, extra spaces, or unnecessary characters at the end.")
    response = gemini.generate_content([uploaded, instruction])
    return response.text
78
-
79
-
80
  @spaces.GPU
81
- def generate_video(activity_list, writer_summary, fps=24): # Lower fps
82
  # Load the Zeroscope video generation model
83
- pipe = DiffusionPipeline.from_pretrained(
84
- "cerspense/zeroscope_v2_576w", # Zeroscope model from Hugging Face
85
- torch_dtype=torch.float16,
86
- cache_dir = "./zeroscope"
 
 
 
 
 
 
87
  )
88
 
 
 
 
 
89
  # Check for available device: CUDA, MPS, or CPU
90
  if torch.cuda.is_available():
91
  device = "cuda"
@@ -96,26 +114,24 @@ def generate_video(activity_list, writer_summary, fps=24): # Lower fps
96
  else:
97
  device = "cpu"
98
  print("CUDA and MPS not available. Falling back to CPU.")
99
- pipe = pipe.to(device)
100
-
101
- # Combine the diary text and writer description for a cohesive prompt
102
- prompts = []
103
- for activity in activity_list:
104
- prompt = writer_summary.strip('.').capitalize() + ' is' + activity[4:]
105
- prompts.append(prompt)
106
 
107
  # Truncate the prompt to fit the CLIP token limit
108
  os.makedirs("videos", exist_ok=True)
109
  video_paths = []
110
- for i, prompt in enumerate(prompts):
111
- video_frames = pipe(prompt, num_inference_steps=60, height=320, width=576, num_frames=fps).frames
112
- video_path = export_to_video(np.squeeze(video_frames, axis=0), output_video_path=f'videos/video{i}.mp4')
 
 
 
 
 
 
 
 
113
  video_paths.append(video_path)
114
 
115
  return video_paths
116
 
117
 
118
 
119
-
120
-
121
-
 
2
  from google.cloud import vision
3
  import io
4
  import google.generativeai as genai
5
+ from diffusers import CogVideoXPipeline
6
  import torch
7
  from diffusers.utils import export_to_video
 
8
  import os
9
  import spaces
10
 
 
34
  return texts[0].description if texts else ''
35
 
36
 
37
# Use the Gemini API (model "gemini-1.5-flash") to analyze an image of the
# diary writer and return a textual description of the people in it,
# https://ai.google.dev/gemini-api/docs/models/gemini.
# NOTE: an earlier comment here referred to "Gemini 1.0 Pro Vision", which
# contradicted the model actually instantiated below; fixed to match the code.
def analyze_writer_image(image_path, api_key):
    """Describe the people in the writer's photo via the Gemini API.

    Args:
        image_path: Path on disk to the writer's image.
        api_key: Google Generative AI API key.

    Returns:
        The model's single-line textual description of the people pictured.
    """
    genai.configure(api_key=api_key)
    model = genai.GenerativeModel("gemini-1.5-flash")
    # Upload the local file so it can be referenced in the prompt below.
    myfile = genai.upload_file(image_path)
    result = model.generate_content(
        [myfile,
         "Provide a detailed description of the people in the picture, "
         "focusing on their characteristics in an animated style. "
         "Use analogies to animals to describe their traits. "
         "Ensure the description is concise, with no line breaks, extra spaces, or unnecessary characters at the end."]
    )
    return result.text
53
+
54
+
55
def break_diary_to_scenes(diary_text, writer_description, api_key):
    """Split a diary entry into four cinematic scene descriptions using GPT-4.

    Args:
        diary_text: Raw text extracted from the diary image.
        writer_description: Gemini-generated description of the diary writer.
        api_key: OpenAI API key.

    Returns:
        The raw model reply, expected in the form
        "Scene 1: ..., Scene 2: ..., Scene 3: ..., Scene 4: ...".
    """
    # Initialize the OpenAI client
    oai_client = openai.Client(api_key=api_key)

    # Few-shot examples of the vivid, animated narration style the model
    # should imitate (order matters: they are numbered in the prompt).
    style_examples = (
        'A garden comes to life as a kaleidoscope of butterflies flutters amidst the blossoms, their delicate wings casting shadows on the petals below. In the background, a grand fountain cascades water with a gentle splendor, its rhythmic sound providing a soothing backdrop. Beneath the cool shade of a mature tree, a solitary wooden chair invites solitude and reflection, its smooth surface worn by the touch of countless visitors seeking a moment of tranquility in nature\'s embrace.',
        'A small boy, head bowed and determination etched on his face, sprints through the torrential downpour as lightning crackles and thunder rumbles in the distance. The relentless rain pounds the ground, creating a chaotic dance of water droplets that mirror the dramatic sky\'s anger. In the far background, the silhouette of a cozy home beckons, a faint beacon of safety and warmth amidst the fierce weather. The scene is one of perseverance and the unyielding spirit of a child braving the elements.',
        'A suited astronaut, with the red dust of Mars clinging to their boots, reaches out to shake hands with an alien being, their skin a shimmering blue, under the pink-tinged sky of the fourth planet. In the background, a sleek silver rocket, a beacon of human ingenuity, stands tall, its engines powered down, as the two representatives of different worlds exchange a historic greeting amidst the desolate beauty of the Martian landscape.',
        'An elderly gentleman, with a serene expression, sits at the water\'s edge, a steaming cup of tea by his side. He is engrossed in his artwork, brush in hand, as he renders an oil painting on a canvas that\'s propped up against a small, weathered table. The sea breeze whispers through his silver hair, gently billowing his loose-fitting white shirt, while the salty air adds an intangible element to his masterpiece in progress. The scene is one of tranquility and inspiration, with the artist\'s canvas capturing the vibrant hues of the setting sun reflecting off the tranquil sea.',
        'A golden retriever, sporting sleek black sunglasses, with its lengthy fur flowing in the breeze, sprints playfully across a rooftop terrace, recently refreshed by a light rain. The scene unfolds from a distance, the dog\'s energetic bounds growing larger as it approaches the camera, its tail wagging with unrestrained joy, while droplets of water glisten on the concrete behind it. The overcast sky provides a dramatic backdrop, emphasizing the vibrant golden coat of the canine as it dashes towards the viewer.',
    )
    # Renders exactly "Example 1: <ex>. Example 2: <ex>. ... Example 5: <ex>."
    examples_text = " ".join(
        f"Example {idx}: {sample}."
        for idx, sample in enumerate(style_examples, start=1)
    )

    prompt = (
        f"Please break the following diary into four distinct movie scenes: {diary_text}. "
        f"Each scene should focus on one unique action and be described in vivid, animated detail. "
        f"Below are some examples for the desired style: {examples_text} "
        f"Ensure that each scene features only one action, with no combinations (e.g., avoid 'eating and teaching' in one scene). "
        f"The main character is described as: {writer_description}. "
        f"Please use expressive, cinematic language to bring the scene to life, focusing on the character’s actions, expressions, and environment. "
        f"Return the output as a list in this format: Scene 1: , Scene 2: , Scene 3: , Scene 4: , without any quotation marks or line breaks."
    )

    # Use the client to call the chat completion API
    reply = oai_client.chat.completions.create(
        model="gpt-4",  # Use GPT-4
        messages=[{"role": "user", "content": prompt}],
        max_tokens=1000,
        temperature=1,
        n=1,  # Number of completions to generate
    )
    return reply.choices[0].message.content
86
 
87
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
88
  @spaces.GPU
89
+ def generate_video(scene_list, fps=24): # Lower fps
90
  # Load the Zeroscope video generation model
91
+ # pipe = DiffusionPipeline.from_pretrained(
92
+ # "cerspense/zeroscope_v2_576w", # Zeroscope model from Hugging Face
93
+ # torch_dtype=torch.float16,
94
+ # cache_dir = "./zeroscope"
95
+ # )
96
+
97
+ pipe = CogVideoXPipeline.from_pretrained(
98
+ "THUDM/CogVideoX-5b",
99
+ torch_dtype=torch.bfloat16,
100
+ cache_dir="./CogVideoX-5b"
101
  )
102
 
103
+ pipe.enable_model_cpu_offload()
104
+ pipe.vae.enable_tiling()
105
+
106
+
107
  # Check for available device: CUDA, MPS, or CPU
108
  if torch.cuda.is_available():
109
  device = "cuda"
 
114
  else:
115
  device = "cpu"
116
  print("CUDA and MPS not available. Falling back to CPU.")
 
 
 
 
 
 
 
117
 
118
  # Truncate the prompt to fit the CLIP token limit
119
  os.makedirs("videos", exist_ok=True)
120
  video_paths = []
121
+ for i, prompt in enumerate(scene_list):
122
+ video = pipe(
123
+ prompt=prompt,
124
+ num_videos_per_prompt=1,
125
+ num_inference_steps=50,
126
+ num_frames=49,
127
+ guidance_scale=6,
128
+ generator=torch.Generator(device=device).manual_seed(42),
129
+ ).frames[0]
130
+
131
+ video_path = export_to_video(video, output_video_path=f'videos/video{i}.mp4')
132
  video_paths.append(video_path)
133
 
134
  return video_paths
135
 
136
 
137