jjz5463 committed on
Commit
5de6ba9
·
1 Parent(s): 75c0fdd

super model

Browse files
Files changed (2) hide show
  1. app.py +7 -5
  2. baseline_utils.py +63 -47
app.py CHANGED
@@ -5,7 +5,7 @@ from google.oauth2 import service_account
5
  from baseline_utils import (detect_text_in_image,
6
  analyze_writer_image,
7
  generate_video,
8
- break_summary_to_activities)
9
  import os
10
 
11
  # Load secrets from Hugging Face Spaces environment
@@ -33,16 +33,18 @@ def process_images(diary_image, writer_image):
33
  # Detect text from the diary image
34
  google_credentials = get_google_credentials()
35
  detected_text = detect_text_in_image(diary_image_path, google_credentials)
36
- activities = break_summary_to_activities(detected_text, openai_api_key)
37
- activity_list = activities.strip('[]').split(', ')
38
 
39
  # Analyze the writer's image using Gemini API
40
  writer_summary = analyze_writer_image(writer_image_path, gemini_api_key)
41
 
 
 
 
 
42
  # Generate the video based on the summaries
43
- video_paths = generate_video(activity_list, writer_summary, fps=24)
44
 
45
- return video_paths, activity_list
46
 
47
 
48
  # Define the Gradio interface
 
5
  from baseline_utils import (detect_text_in_image,
6
  analyze_writer_image,
7
  generate_video,
8
+ break_diary_to_scenes)
9
  import os
10
 
11
  # Load secrets from Hugging Face Spaces environment
 
33
  # Detect text from the diary image
34
  google_credentials = get_google_credentials()
35
  detected_text = detect_text_in_image(diary_image_path, google_credentials)
 
 
36
 
37
  # Analyze the writer's image using Gemini API
38
  writer_summary = analyze_writer_image(writer_image_path, gemini_api_key)
39
 
40
+ scenes = break_diary_to_scenes(detected_text, writer_summary, openai_api_key)
41
+ scene_list = [scene.strip() for scene in scenes.split("Scene")[1:]]
42
+ scene_list = [scene.split(": ", 1)[1] for scene in scene_list]
43
+
44
  # Generate the video based on the summaries
45
+ video_paths = generate_video(scene_list, fps=24)
46
 
47
+ return video_paths, scene_list
48
 
49
 
50
  # Define the Gradio interface
baseline_utils.py CHANGED
@@ -2,10 +2,9 @@ import openai
2
  from google.cloud import vision
3
  import io
4
  import google.generativeai as genai
5
- from diffusers import DiffusionPipeline
6
  import torch
7
  from diffusers.utils import export_to_video
8
- import numpy as np
9
  import os
10
  import spaces
11
 
@@ -35,24 +34,50 @@ def detect_text_in_image(image_path, credentials):
35
  return texts[0].description if texts else ''
36
 
37
 
38
- def break_summary_to_activities(text, api_key):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
39
  # Initialize the OpenAI client
40
  client = openai.Client(api_key=api_key)
41
 
 
 
 
 
 
 
 
42
  # Use the client to call the chat completion API
43
  response = client.chat.completions.create(
44
  model="gpt-4", # Use GPT-4
45
  messages=[
46
- {"role": "user",
47
- "content": f"Please break the following diary into exactly four most important activities. "
48
- f"Each activity must be formatted as 'I am [activity]' and must describe only one specific action. "
49
- f"Make sure each activity is distinct and only contains a single action (e.g., no combinations like 'eating and teaching'). "
50
- f"Additionally, each activity should be no more than six words: {text}. "
51
- f"Return the four activities as a list in the following format: "
52
- f"[activity1, activity2, activity3, activity4], without any quotation marks, extra text, or explanations."}
 
53
  ],
54
- max_tokens=150,
55
- temperature=0.7,
56
  n=1 # Number of completions to generate
57
  )
58
 
@@ -60,32 +85,25 @@ def break_summary_to_activities(text, api_key):
60
  return response.choices[0].message.content
61
 
62
 
63
# Describe the diary writer's photo with Google's Gemini API
# (https://ai.google.dev/gemini-api/docs/models/gemini): upload the
# image, ask for a terse description, and hand back the model's text.
def analyze_writer_image(image_path, api_key):
    """Return a very short Gemini-generated description of the writer photo.

    Args:
        image_path: Path on disk to the writer's image.
        api_key: Google Generative AI API key.

    Returns:
        The model's textual description of the people in the picture.
    """
    genai.configure(api_key=api_key)
    gemini = genai.GenerativeModel("gemini-1.5-flash")
    # Upload the local file so it can be referenced inside the prompt list.
    uploaded = genai.upload_file(image_path)
    instruction = ("Provide a description of the people in the picture, "
                   "focusing on their characteristics. Keep it under two words "
                   "and ensure the description does not contain any line breaks, extra spaces, or unnecessary characters at the end.")
    response = gemini.generate_content([uploaded, instruction])
    return response.text
78
-
79
-
80
  @spaces.GPU
81
- def generate_video(activity_list, writer_summary, fps=24): # Lower fps
82
  # Load the Zeroscope video generation model
83
- pipe = DiffusionPipeline.from_pretrained(
84
- "cerspense/zeroscope_v2_576w", # Zeroscope model from Hugging Face
85
- torch_dtype=torch.float16,
86
- cache_dir = "./zeroscope"
 
 
 
 
 
 
87
  )
88
 
 
 
 
 
89
  # Check for available device: CUDA, MPS, or CPU
90
  if torch.cuda.is_available():
91
  device = "cuda"
@@ -96,26 +114,24 @@ def generate_video(activity_list, writer_summary, fps=24): # Lower fps
96
  else:
97
  device = "cpu"
98
  print("CUDA and MPS not available. Falling back to CPU.")
99
- pipe = pipe.to(device)
100
-
101
- # Combine the diary text and writer description for a cohesive prompt
102
- prompts = []
103
- for activity in activity_list:
104
- prompt = writer_summary.strip('.').capitalize() + ' is' + activity[4:]
105
- prompts.append(prompt)
106
 
107
  # Truncate the prompt to fit the CLIP token limit
108
  os.makedirs("videos", exist_ok=True)
109
  video_paths = []
110
- for i, prompt in enumerate(prompts):
111
- video_frames = pipe(prompt, num_inference_steps=60, height=320, width=576, num_frames=fps).frames
112
- video_path = export_to_video(np.squeeze(video_frames, axis=0), output_video_path=f'videos/video{i}.mp4')
 
 
 
 
 
 
 
 
113
  video_paths.append(video_path)
114
 
115
  return video_paths
116
 
117
 
118
 
119
-
120
-
121
-
 
2
  from google.cloud import vision
3
  import io
4
  import google.generativeai as genai
5
+ from diffusers import CogVideoXPipeline
6
  import torch
7
  from diffusers.utils import export_to_video
 
8
  import os
9
  import spaces
10
 
 
34
  return texts[0].description if texts else ''
35
 
36
 
37
# Use the Gemini API (model "gemini-1.5-flash") to analyze an image of the
# diary writer and return a textual description of the people in it,
# https://ai.google.dev/gemini-api/docs/models/gemini.
# NOTE: an earlier comment here referred to "Gemini 1.0 Pro Vision", which
# contradicted the model actually instantiated below; fixed to match the code.
def analyze_writer_image(image_path, api_key):
    """Describe the people in the writer's photo via the Gemini API.

    Args:
        image_path: Path on disk to the writer's image.
        api_key: Google Generative AI API key.

    Returns:
        The model's single-line textual description of the people pictured.
    """
    genai.configure(api_key=api_key)
    model = genai.GenerativeModel("gemini-1.5-flash")
    # Upload the local file so it can be referenced in the prompt below.
    myfile = genai.upload_file(image_path)
    result = model.generate_content(
        [myfile,
         "Provide a detailed description of the people in the picture, "
         "focusing on their characteristics in an animated style. "
         "Use analogies to animals to describe their traits. "
         "Ensure the description is concise, with no line breaks, extra spaces, or unnecessary characters at the end."]
    )
    return result.text
53
+
54
+
55
def break_diary_to_scenes(diary_text, writer_description, api_key):
    """Split a diary entry into four cinematic scene descriptions using GPT-4.

    Args:
        diary_text: Raw text extracted from the diary image.
        writer_description: Gemini-generated description of the diary writer.
        api_key: OpenAI API key.

    Returns:
        The raw model reply, expected in the form
        "Scene 1: ..., Scene 2: ..., Scene 3: ..., Scene 4: ...".
    """
    # Initialize the OpenAI client
    oai_client = openai.Client(api_key=api_key)

    # Few-shot examples of the vivid, animated narration style the model
    # should imitate (order matters: they are numbered in the prompt).
    style_examples = (
        'A garden comes to life as a kaleidoscope of butterflies flutters amidst the blossoms, their delicate wings casting shadows on the petals below. In the background, a grand fountain cascades water with a gentle splendor, its rhythmic sound providing a soothing backdrop. Beneath the cool shade of a mature tree, a solitary wooden chair invites solitude and reflection, its smooth surface worn by the touch of countless visitors seeking a moment of tranquility in nature\'s embrace.',
        'A small boy, head bowed and determination etched on his face, sprints through the torrential downpour as lightning crackles and thunder rumbles in the distance. The relentless rain pounds the ground, creating a chaotic dance of water droplets that mirror the dramatic sky\'s anger. In the far background, the silhouette of a cozy home beckons, a faint beacon of safety and warmth amidst the fierce weather. The scene is one of perseverance and the unyielding spirit of a child braving the elements.',
        'A suited astronaut, with the red dust of Mars clinging to their boots, reaches out to shake hands with an alien being, their skin a shimmering blue, under the pink-tinged sky of the fourth planet. In the background, a sleek silver rocket, a beacon of human ingenuity, stands tall, its engines powered down, as the two representatives of different worlds exchange a historic greeting amidst the desolate beauty of the Martian landscape.',
        'An elderly gentleman, with a serene expression, sits at the water\'s edge, a steaming cup of tea by his side. He is engrossed in his artwork, brush in hand, as he renders an oil painting on a canvas that\'s propped up against a small, weathered table. The sea breeze whispers through his silver hair, gently billowing his loose-fitting white shirt, while the salty air adds an intangible element to his masterpiece in progress. The scene is one of tranquility and inspiration, with the artist\'s canvas capturing the vibrant hues of the setting sun reflecting off the tranquil sea.',
        'A golden retriever, sporting sleek black sunglasses, with its lengthy fur flowing in the breeze, sprints playfully across a rooftop terrace, recently refreshed by a light rain. The scene unfolds from a distance, the dog\'s energetic bounds growing larger as it approaches the camera, its tail wagging with unrestrained joy, while droplets of water glisten on the concrete behind it. The overcast sky provides a dramatic backdrop, emphasizing the vibrant golden coat of the canine as it dashes towards the viewer.',
    )
    # Renders exactly "Example 1: <ex>. Example 2: <ex>. ... Example 5: <ex>."
    examples_text = " ".join(
        f"Example {idx}: {sample}."
        for idx, sample in enumerate(style_examples, start=1)
    )

    prompt = (
        f"Please break the following diary into four distinct movie scenes: {diary_text}. "
        f"Each scene should focus on one unique action and be described in vivid, animated detail. "
        f"Below are some examples for the desired style: {examples_text} "
        f"Ensure that each scene features only one action, with no combinations (e.g., avoid 'eating and teaching' in one scene). "
        f"The main character is described as: {writer_description}. "
        f"Please use expressive, cinematic language to bring the scene to life, focusing on the character’s actions, expressions, and environment. "
        f"Return the output as a list in this format: Scene 1: , Scene 2: , Scene 3: , Scene 4: , without any quotation marks or line breaks."
    )

    # Use the client to call the chat completion API
    reply = oai_client.chat.completions.create(
        model="gpt-4",  # Use GPT-4
        messages=[{"role": "user", "content": prompt}],
        max_tokens=1000,
        temperature=1,
        n=1,  # Number of completions to generate
    )
    return reply.choices[0].message.content
86
 
87
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
88
  @spaces.GPU
89
+ def generate_video(scene_list, fps=24): # Lower fps
90
  # Load the Zeroscope video generation model
91
+ # pipe = DiffusionPipeline.from_pretrained(
92
+ # "cerspense/zeroscope_v2_576w", # Zeroscope model from Hugging Face
93
+ # torch_dtype=torch.float16,
94
+ # cache_dir = "./zeroscope"
95
+ # )
96
+
97
+ pipe = CogVideoXPipeline.from_pretrained(
98
+ "THUDM/CogVideoX-5b",
99
+ torch_dtype=torch.bfloat16,
100
+ cache_dir="./CogVideoX-5b"
101
  )
102
 
103
+ pipe.enable_model_cpu_offload()
104
+ pipe.vae.enable_tiling()
105
+
106
+
107
  # Check for available device: CUDA, MPS, or CPU
108
  if torch.cuda.is_available():
109
  device = "cuda"
 
114
  else:
115
  device = "cpu"
116
  print("CUDA and MPS not available. Falling back to CPU.")
 
 
 
 
 
 
 
117
 
118
  # Truncate the prompt to fit the CLIP token limit
119
  os.makedirs("videos", exist_ok=True)
120
  video_paths = []
121
+ for i, prompt in enumerate(scene_list):
122
+ video = pipe(
123
+ prompt=prompt,
124
+ num_videos_per_prompt=1,
125
+ num_inference_steps=50,
126
+ num_frames=49,
127
+ guidance_scale=6,
128
+ generator=torch.Generator(device=device).manual_seed(42),
129
+ ).frames[0]
130
+
131
+ video_path = export_to_video(video, output_video_path=f'videos/video{i}.mp4')
132
  video_paths.append(video_path)
133
 
134
  return video_paths
135
 
136
 
137