Spaces:
Runtime error
Runtime error
Delete src/app.py
Browse files- src/app.py +0 -222
src/app.py
DELETED
@@ -1,222 +0,0 @@
|
|
1 |
-
import gradio as gr
|
2 |
-
import os
|
3 |
-
import numpy as np
|
4 |
-
from gradio_utils import *
|
5 |
-
|
6 |
-
def image_mod(image):
    """Return ``image`` rotated by 45 degrees via its own ``rotate`` method."""
    rotated = image.rotate(45)
    return rotated
|
8 |
-
|
9 |
-
import os
|
10 |
-
|
11 |
-
import sys
|
12 |
-
sys.path.insert(1, os.path.join(sys.path[0], '..'))
|
13 |
-
|
14 |
-
|
15 |
-
import cv2
|
16 |
-
import numpy as np
|
17 |
-
import torch
|
18 |
-
import torch.nn.functional as F
|
19 |
-
|
20 |
-
|
21 |
-
|
22 |
-
|
23 |
-
from models.pipelines import TextToVideoSDPipelineSpatialAware
|
24 |
-
|
25 |
-
|
26 |
-
|
27 |
-
# Number of trajectory key points; in the visible code this is only referenced
# by the commented-out legacy generate_bb implementation.
NUM_POINTS = 3
|
28 |
-
# Number of video frames: sizes the per-frame mask canvas and the random
# latents, and is passed to the pipeline as num_frames.
NUM_FRAMES = 24
|
29 |
-
# Bounding-box size used for the "large" option; "medium" uses 0.75x of this
# and any other size value uses half of it (see generate_bb).
LARGE_BOX_SIZE = 256
|
30 |
-
|
31 |
-
|
32 |
-
def generate_video(pipe, overall_prompt, latents, get_latents=False, num_frames=24, num_inference_steps=50, fg_masks=None,
                   fg_masked_latents=None, frozen_steps=0, frozen_prompt=None, custom_attention_mask=None, fg_prompt=None):
    """Run the text-to-video pipeline and return its frames.

    Args:
        pipe: Callable pipeline; its result must expose a ``frames`` attribute.
        overall_prompt: Prompt passed as the first positional argument.
        latents: Initial latents forwarded as ``latents``.
        get_latents: When true, the pipeline is invoked a second time with
            ``output_type="latent"`` and both results are returned.
        num_frames: Forwarded as ``num_frames``.
        num_inference_steps: Forwarded as ``num_inference_steps``.
        fg_masks: Forwarded as ``frozen_mask``.
        fg_masked_latents: Forwarded as ``latents_all_input``.
        frozen_steps: Forwarded as ``frozen_steps``.
        frozen_prompt: Forwarded as ``frozen_prompt``.
        custom_attention_mask: Forwarded as ``custom_attention_mask``.
        fg_prompt: Forwarded as ``fg_prompt``.

    Returns:
        The ``frames`` of the first call, or a ``(frames, latents)`` tuple when
        ``get_latents`` is true.
    """
    # Arguments common to both pipeline invocations.
    sampling_kwargs = dict(
        num_frames=num_frames,
        latents=latents,
        num_inference_steps=num_inference_steps,
    )
    # Spatial-control arguments; only the first (frame-producing) call gets them.
    spatial_kwargs = dict(
        frozen_mask=fg_masks,
        frozen_steps=frozen_steps,
        latents_all_input=fg_masked_latents,
        frozen_prompt=frozen_prompt,
        custom_attention_mask=custom_attention_mask,
        fg_prompt=fg_prompt,
        make_attention_mask_2d=True,
        attention_mask_block_diagonal=True,
        height=320,
        width=576,
    )
    video_frames = pipe(overall_prompt, **sampling_kwargs, **spatial_kwargs).frames

    if not get_latents:
        return video_frames

    # The second pass omits every spatial-control argument and asks for latents.
    video_latents = pipe(overall_prompt, **sampling_kwargs, output_type="latent").frames
    return video_frames, video_latents
|
43 |
-
|
44 |
-
|
45 |
-
# def generate_bb(prompt, fg_object, aspect_ratio, size, trajectory):
|
46 |
-
|
47 |
-
# if len(trajectory['layers']) < NUM_POINTS:
|
48 |
-
# raise ValueError
|
49 |
-
# final_canvas = torch.zeros((NUM_FRAMES,320,576))
|
50 |
-
|
51 |
-
# bbox_size_x = LARGE_BOX_SIZE if size == "large" else int(LARGE_BOX_SIZE * 0.75) if size == "medium" else LARGE_BOX_SIZE//2
|
52 |
-
# bbox_size_y = bbox_size_x if aspect_ratio == "square" else int(bbox_size_x * 0.75) if aspect_ratio == "horizontal" else int(bbox_size_x * 1.25)
|
53 |
-
|
54 |
-
# bbox_coords = []
|
55 |
-
# # TODO add checks for trajectory
|
56 |
-
# for t in trajectory['layers']:
|
57 |
-
# bbox_coords.append([int(t.sum(axis=-2).argmax()*576/800), int(t.sum(axis=-1)[140:460].argmax())])
|
58 |
-
# bbox_coords = np.array(bbox_coords)
|
59 |
-
# # Make a list of length 24
|
60 |
-
# # Each element is a list of length 2
|
61 |
-
# # First element is the x coordinate of the bbox
|
62 |
-
# # Second element is a set of y coordinates of the bbox
|
63 |
-
# new_bbox_coords = [np.zeros(2,) for i in range(NUM_FRAMES)]
|
64 |
-
# divisor = int(NUM_FRAMES / (NUM_POINTS-1))
|
65 |
-
# for i in range(NUM_POINTS-1):
|
66 |
-
# new_bbox_coords[i*divisor] = bbox_coords[i]
|
67 |
-
# new_bbox_coords[-1] = bbox_coords[-1]
|
68 |
-
|
69 |
-
# # Linearly interpolate in the middle
|
70 |
-
# for i in range(NUM_POINTS-1):
|
71 |
-
# for j in range(1,divisor):
|
72 |
-
# new_bbox_coords[i*divisor+j][1] = int((bbox_coords[i][0] * (divisor-j) + bbox_coords[(i+1)][0] * j) / divisor)
|
73 |
-
# new_bbox_coords[i*divisor+j][0] = int((bbox_coords[i][1] * (divisor-j) + bbox_coords[(i+1)][1] * j) / divisor)
|
74 |
-
|
75 |
-
# for i in range(NUM_FRAMES):
|
76 |
-
# x = int(new_bbox_coords[i][0])
|
77 |
-
# y = int(new_bbox_coords[i][1])
|
78 |
-
# final_canvas[i,int(x-bbox_size_x/2):int(x+bbox_size_x/2), int(y-bbox_size_y/2):int(y+bbox_size_y/2)] = 1
|
79 |
-
|
80 |
-
# torch_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
81 |
-
# try:
|
82 |
-
# pipe = TextToVideoSDPipelineSpatialAware.from_pretrained(
|
83 |
-
# "cerspense/zeroscope_v2_576w", torch_dtype=torch.float, variant="fp32").to(torch_device)
|
84 |
-
# except:
|
85 |
-
# pipe = TextToVideoSDPipelineSpatialAware.from_pretrained(
|
86 |
-
# "cerspense/zeroscope_v2_576w", torch_dtype=torch.float, variant="fp32").to(torch_device)
|
87 |
-
|
88 |
-
# fg_masks = F.interpolate(final_canvas.unsqueeze(1), size=(40,72), mode="nearest").to(torch_device)
|
89 |
-
|
90 |
-
# # Save fg_masks as images
|
91 |
-
# for i in range(NUM_FRAMES):
|
92 |
-
# cv2.imwrite(f"./fg_masks/frame_{i:04d}.png", fg_masks[i,0].cpu().numpy()*255)
|
93 |
-
|
94 |
-
|
95 |
-
|
96 |
-
# seed = 2
|
97 |
-
# random_latents = torch.randn([1, 4, NUM_FRAMES, 40, 72], generator=torch.Generator().manual_seed(seed)).to(torch_device)
|
98 |
-
# overall_prompt = f"A realistic lively {prompt}"
|
99 |
-
# video_frames = generate_video(pipe, overall_prompt, random_latents, get_latents=False, num_frames=NUM_FRAMES, num_inference_steps=40,
|
100 |
-
# fg_masks=fg_masks, fg_masked_latents=None, frozen_steps=2, frozen_prompt=None, fg_prompt=fg_object)
|
101 |
-
|
102 |
-
# return create_video(video_frames,fps=8, type="final")
|
103 |
-
|
104 |
-
|
105 |
-
def interpolate_points(points, target_length):
    """Resample an ordered sequence of points to ``target_length`` entries.

    Args:
        points: Ordered points along a path. When points have to be added, the
            individual points must support NumPy arithmetic (e.g. rows of an
            ``ndarray``).
        target_length: Number of points wanted.

    Returns:
        ``points`` itself when it already has ``target_length`` entries.
        Otherwise a list of ``target_length`` points: a uniform subsample when
        there are too many, or the original points with rounded, linearly
        interpolated points inserted between neighbours when there are too few.
        When the points to add do not divide evenly over the segments, the
        remainder is filled by repeating the last point.

    Raises:
        ValueError: If ``points`` is empty while ``target_length`` is not 0.
    """
    count = len(points)
    if count == target_length:
        return points
    if count == 0:
        raise ValueError("interpolate_points() needs at least one point")

    if count > target_length:
        # Subsample the points uniformly.
        indices = np.round(np.linspace(0, count - 1, target_length)).astype(int)
        return [points[i] for i in indices]

    if count == 1:
        # No segment to interpolate along (and the per-segment division below
        # would divide by zero): the path is stationary.
        return [points[0] for _ in range(target_length)]

    # Same number of extra points in every segment; any remainder is padded
    # at the end below.
    per_segment = (target_length - count) // (count - 1)

    resampled = []
    for start, end in zip(points[:-1], points[1:]):
        resampled.append(start)
        for step in range(1, per_segment + 1):
            fraction = step / (per_segment + 1)
            resampled.append(np.round(start + fraction * (end - start)))

    # Add the last point.
    resampled.append(points[-1])

    # If there are still not enough points, repeat the last one.
    while len(resampled) < target_length:
        resampled.append(points[-1])

    return resampled
|
135 |
-
|
136 |
-
|
137 |
-
# Run on the GPU when one is available, otherwise fall back to the CPU.
torch_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


def _load_pipeline():
    """Load the zeroscope_v2_576w spatial-aware pipeline onto ``torch_device``."""
    return TextToVideoSDPipelineSpatialAware.from_pretrained(
        "cerspense/zeroscope_v2_576w", torch_dtype=torch.float, variant="fp32").to(torch_device)


# The pipeline is loaded once at import time and shared by every request.
try:
    pipe = _load_pipeline()
except Exception as exc:
    # The original fallback repeated the identical call, i.e. a single retry.
    # Catch Exception rather than using a bare except so that
    # KeyboardInterrupt/SystemExit are not swallowed, and report the failure.
    print(f"Loading the pipeline failed ({exc!r}); retrying once")
    pipe = _load_pipeline()
|
146 |
-
|
147 |
-
|
148 |
-
def generate_bb(prompt, fg_object, aspect_ratio, size, motion_direction, trajectory):
    """Generate a video in which a foreground box follows a drawn trajectory.

    Args:
        prompt: Scene description; it is embedded into
            ``"A realistic lively {prompt}"``.
        fg_object: Prompt for the foreground object (passed as ``fg_prompt``).
        aspect_ratio: ``"square"``, ``"horizontal"`` (box 1.33x as wide as it
            is tall) or anything else (box 0.75x as wide as it is tall).
        size: ``"large"``, ``"medium"`` or anything else (treated as small).
        motion_direction: ``"l2r"``/``"r2l"`` order the trajectory points by
            column, anything else by row; ``"r2l"`` and ``"d2u"`` reverse the
            order.
        trajectory: Mapping whose ``'composite'`` entry is the drawn image as
            an array accepted by ``cv2.resize``.

    Returns:
        The result of ``create_video(video_frames, fps=8, type="final")``.
        ``create_video`` is not defined in the visible part of this file.

    Raises:
        gr.Error: If no trajectory point can be extracted from the drawing.
    """
    # Box extent in pixels. "x" runs along image rows (height) and "y" along
    # image columns (width), matching the (row, col) point order used below.
    bbox_size_x = LARGE_BOX_SIZE if size == "large" else int(LARGE_BOX_SIZE * 0.75) if size == "medium" else LARGE_BOX_SIZE // 2
    bbox_size_y = bbox_size_x if aspect_ratio == "square" else int(bbox_size_x * 1.33) if aspect_ratio == "horizontal" else int(bbox_size_x * 0.75)

    image = trajectory['composite']
    if image is None:
        raise gr.Error("Please draw a trajectory before generating a video.")

    # Find the contours of the drawing on a 576x320 (width x height) canvas.
    # NOTE(review): the Paint component is configured with image_mode="RGB"
    # while the conversion below is BGR2GRAY -- confirm this is intended.
    image = cv2.resize(image, (576, 320))
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    _, thresh = cv2.threshold(gray, 30, 255, cv2.THRESH_BINARY_INV)
    contours, _ = cv2.findContours(thresh, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)

    # Collect the simplified contour vertices as (row, col) pairs.
    bbox_points = []
    for contour in contours:
        # Approximate the contour to reduce the number of points.
        epsilon = 0.01 * cv2.arcLength(contour, True)
        approx = cv2.approxPolyDP(contour, epsilon, True)
        for point in approx:
            # OpenCV stores contour points as (column, row).
            col, row = point.ravel()
            # Drop vertices lying on the image border.
            if 1 <= row < 319 and 1 <= col < 575:
                bbox_points.append([row, col])

    if not bbox_points:
        # Without any point there is nothing to interpolate along.
        raise gr.Error("No trajectory found in the drawing; please draw a path on the canvas.")

    # Order the points along the requested direction of motion.
    if motion_direction in ('l2r', 'r2l'):
        sorted_points = sorted(bbox_points, key=lambda p: p[1], reverse=motion_direction == "r2l")
    else:
        sorted_points = sorted(bbox_points, key=lambda p: p[0], reverse=motion_direction == "d2u")

    if len(sorted_points) == 1:
        # A single point is a stationary path; give interpolate_points a
        # (degenerate) segment so it does not divide by zero.
        sorted_points = sorted_points * 2

    # One box centre per frame.
    final_points = interpolate_points(np.array(sorted_points), NUM_FRAMES)

    # Rasterise the per-frame boxes at 1/8 of the image resolution (40x72).
    final_canvas = torch.zeros((NUM_FRAMES, 320 // 8, 576 // 8))
    for i in range(NUM_FRAMES):
        x = int(final_points[i][0])
        y = int(final_points[i][1])
        # Clamp the box so that it stays 16 pixels away from every image edge.
        row_lo = max(int(x - bbox_size_x / 2), 16) // 8
        row_hi = min(int(x + bbox_size_x / 2), 304) // 8
        col_lo = max(int(y - bbox_size_y / 2), 16) // 8
        col_hi = min(int(y + bbox_size_y / 2), 560) // 8
        final_canvas[i, row_lo:row_hi, col_lo:col_hi] = 1

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    fg_masks = final_canvas.unsqueeze(1).to(device)

    # Save fg_masks as images; make sure the output directory exists first so
    # that the writes cannot fail because of a missing folder.
    os.makedirs("./fg_masks", exist_ok=True)
    for i in range(NUM_FRAMES):
        cv2.imwrite(f"./fg_masks/frame_{i:04d}.png", fg_masks[i, 0].cpu().numpy() * 255)

    # Fixed seed, so the starting latents are the same on every call.
    seed = 2
    random_latents = torch.randn([1, 4, NUM_FRAMES, 40, 72], generator=torch.Generator().manual_seed(seed)).to(device)
    overall_prompt = f"A realistic lively {prompt}"
    video_frames = generate_video(pipe, overall_prompt, random_latents, get_latents=False, num_frames=NUM_FRAMES, num_inference_steps=40,
                                  fg_masks=fg_masks, fg_masked_latents=None, frozen_steps=2, frozen_prompt=None, fg_prompt=fg_object)

    return create_video(video_frames, fps=8, type="final")
|
210 |
-
|
211 |
-
|
212 |
-
|
213 |
-
# Input components, in the positional order expected by generate_bb after the
# two free-text fields (prompt, fg_object).
_aspect_ratio_input = gr.Radio(choices=["square", "horizontal", "vertical"])
_size_input = gr.Radio(choices=["small", "medium", "large"])
_direction_input = gr.Radio(choices=["l2r", "r2l", "u2d", "d2u"])
# Drawing canvas for the trajectory, initialised with all-zero 320x576 arrays.
_trajectory_input = gr.Paint(
    value={
        'background': np.zeros((320, 576)),
        'layers': [],
        'composite': np.zeros((320, 576)),
    },
    type="numpy",
    image_mode="RGB",
    height=320,
    width=576,
)

demo = gr.Interface(
    fn=generate_bb,
    inputs=[
        "text",
        "text",
        _aspect_ratio_input,
        _size_input,
        _direction_input,
        _trajectory_input,
    ],
    outputs=gr.Video(),
)


if __name__ == "__main__":
    # share=True is passed through to Gradio's launch().
    demo.launch(share=True)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|