anshuln committed on
Commit
2d30cbe
·
verified ·
1 Parent(s): 759e47c

Delete src/app.py

Browse files
Files changed (1) hide show
  1. src/app.py +0 -222
src/app.py DELETED
@@ -1,222 +0,0 @@
1
- import gradio as gr
2
- import os
3
- import numpy as np
4
- from gradio_utils import *
5
-
6
def image_mod(image):
    """Return ``image`` rotated by 45 degrees via its ``rotate`` method."""
    rotated = image.rotate(45)
    return rotated
8
-
9
- import os
10
-
11
- import sys
12
- sys.path.insert(1, os.path.join(sys.path[0], '..'))
13
-
14
-
15
- import cv2
16
- import numpy as np
17
- import torch
18
- import torch.nn.functional as F
19
-
20
-
21
-
22
-
23
- from models.pipelines import TextToVideoSDPipelineSpatialAware
24
-
25
-
26
-
27
# Number of user-supplied trajectory points; only referenced by the
# commented-out legacy generate_bb below.
NUM_POINTS = 3
# Number of video frames generated (and of per-frame foreground masks built).
NUM_FRAMES = 24
# Base bounding-box extent used when the "large" size option is selected;
# the other size options are derived from it.
LARGE_BOX_SIZE = 256
30
-
31
-
32
def generate_video(pipe, overall_prompt, latents, get_latents=False, num_frames=24, num_inference_steps=50, fg_masks=None,
                   fg_masked_latents=None, frozen_steps=0, frozen_prompt=None, custom_attention_mask=None, fg_prompt=None):
    """Run ``pipe`` on ``overall_prompt`` and return the ``.frames`` of its output.

    The first call forwards the foreground-mask / frozen-step / attention-mask
    arguments and always requests a 320x576 output with
    ``make_attention_mask_2d`` and ``attention_mask_block_diagonal`` enabled.

    When ``get_latents`` is true, ``pipe`` is invoked a second time with
    ``output_type="latent"`` (and none of the mask arguments), and the function
    returns ``(video_frames, video_latents)`` instead of just ``video_frames``.
    """
    # Arguments common to both pipeline invocations.
    shared_kwargs = dict(
        num_frames=num_frames,
        latents=latents,
        num_inference_steps=num_inference_steps,
    )

    guided_output = pipe(
        overall_prompt,
        frozen_mask=fg_masks,
        frozen_steps=frozen_steps,
        latents_all_input=fg_masked_latents,
        frozen_prompt=frozen_prompt,
        custom_attention_mask=custom_attention_mask,
        fg_prompt=fg_prompt,
        make_attention_mask_2d=True,
        attention_mask_block_diagonal=True,
        height=320,
        width=576,
        **shared_kwargs,
    )
    video_frames = guided_output.frames

    if not get_latents:
        return video_frames

    video_latents = pipe(overall_prompt, output_type="latent", **shared_kwargs).frames
    return video_frames, video_latents
43
-
44
-
45
- # def generate_bb(prompt, fg_object, aspect_ratio, size, trajectory):
46
-
47
- # if len(trajectory['layers']) < NUM_POINTS:
48
- # raise ValueError
49
- # final_canvas = torch.zeros((NUM_FRAMES,320,576))
50
-
51
- # bbox_size_x = LARGE_BOX_SIZE if size == "large" else int(LARGE_BOX_SIZE * 0.75) if size == "medium" else LARGE_BOX_SIZE//2
52
- # bbox_size_y = bbox_size_x if aspect_ratio == "square" else int(bbox_size_x * 0.75) if aspect_ratio == "horizontal" else int(bbox_size_x * 1.25)
53
-
54
- # bbox_coords = []
55
- # # TODO add checks for trajectory
56
- # for t in trajectory['layers']:
57
- # bbox_coords.append([int(t.sum(axis=-2).argmax()*576/800), int(t.sum(axis=-1)[140:460].argmax())])
58
- # bbox_coords = np.array(bbox_coords)
59
- # # Make a list of length 24
60
- # # Each element is a list of length 2
61
- # # First element is the x coordinate of the bbox
62
- # # Second element is a set of y coordinates of the bbox
63
- # new_bbox_coords = [np.zeros(2,) for i in range(NUM_FRAMES)]
64
- # divisor = int(NUM_FRAMES / (NUM_POINTS-1))
65
- # for i in range(NUM_POINTS-1):
66
- # new_bbox_coords[i*divisor] = bbox_coords[i]
67
- # new_bbox_coords[-1] = bbox_coords[-1]
68
-
69
- # # Linearly interpolate in the middle
70
- # for i in range(NUM_POINTS-1):
71
- # for j in range(1,divisor):
72
- # new_bbox_coords[i*divisor+j][1] = int((bbox_coords[i][0] * (divisor-j) + bbox_coords[(i+1)][0] * j) / divisor)
73
- # new_bbox_coords[i*divisor+j][0] = int((bbox_coords[i][1] * (divisor-j) + bbox_coords[(i+1)][1] * j) / divisor)
74
-
75
- # for i in range(NUM_FRAMES):
76
- # x = int(new_bbox_coords[i][0])
77
- # y = int(new_bbox_coords[i][1])
78
- # final_canvas[i,int(x-bbox_size_x/2):int(x+bbox_size_x/2), int(y-bbox_size_y/2):int(y+bbox_size_y/2)] = 1
79
-
80
- # torch_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
81
- # try:
82
- # pipe = TextToVideoSDPipelineSpatialAware.from_pretrained(
83
- # "cerspense/zeroscope_v2_576w", torch_dtype=torch.float, variant="fp32").to(torch_device)
84
- # except:
85
- # pipe = TextToVideoSDPipelineSpatialAware.from_pretrained(
86
- # "cerspense/zeroscope_v2_576w", torch_dtype=torch.float, variant="fp32").to(torch_device)
87
-
88
- # fg_masks = F.interpolate(final_canvas.unsqueeze(1), size=(40,72), mode="nearest").to(torch_device)
89
-
90
- # # Save fg_masks as images
91
- # for i in range(NUM_FRAMES):
92
- # cv2.imwrite(f"./fg_masks/frame_{i:04d}.png", fg_masks[i,0].cpu().numpy()*255)
93
-
94
-
95
-
96
- # seed = 2
97
- # random_latents = torch.randn([1, 4, NUM_FRAMES, 40, 72], generator=torch.Generator().manual_seed(seed)).to(torch_device)
98
- # overall_prompt = f"A realistic lively {prompt}"
99
- # video_frames = generate_video(pipe, overall_prompt, random_latents, get_latents=False, num_frames=NUM_FRAMES, num_inference_steps=40,
100
- # fg_masks=fg_masks, fg_masked_latents=None, frozen_steps=2, frozen_prompt=None, fg_prompt=fg_object)
101
-
102
- # return create_video(video_frames,fps=8, type="final")
103
-
104
-
105
def interpolate_points(points, target_length):
    """Resample a sequence of points to exactly ``target_length`` entries.

    Args:
        points: Sequence of points (e.g. an ``(N, 2)`` NumPy array or a list of
            coordinate pairs).
        target_length: Number of points wanted in the result.

    Returns:
        ``points`` itself when it already has ``target_length`` entries;
        otherwise a list of ``target_length`` points. Longer inputs are
        uniformly subsampled. Shorter inputs get the same number of rounded,
        linearly interpolated points inserted into every segment, and any
        remaining shortfall is filled by repeating the last point.

    Raises:
        ValueError: If ``points`` is empty while ``target_length`` is not 0.
    """
    num_points = len(points)
    if num_points == target_length:
        return points
    if num_points == 0:
        raise ValueError("interpolate_points requires at least one point")

    if num_points > target_length:
        # Subsample the points uniformly.
        indices = np.round(np.linspace(0, num_points - 1, target_length)).astype(int)
        return [points[i] for i in indices]

    if num_points == 1:
        # There is no segment to interpolate along (and the per-segment
        # division below would divide by zero), so hold the single point.
        return [points[0]] * target_length

    # Linearly interpolate to get more points, spreading the additions evenly
    # over the segments between consecutive input points.
    points_added_per_segment = (target_length - num_points) // (num_points - 1)

    interpolated_points = []
    for i in range(num_points - 1):
        # np.asarray lets plain Python lists be used as points as well.
        start = np.asarray(points[i])
        end = np.asarray(points[i + 1])
        interpolated_points.append(start)
        for j in range(1, points_added_per_segment + 1):
            fraction = j / (points_added_per_segment + 1)
            interpolated_points.append(np.round(start + fraction * (end - start)))

    # Add the last point, then pad with copies of it when the integer division
    # above left the result short of target_length.
    interpolated_points.append(points[-1])
    shortfall = target_length - len(interpolated_points)
    interpolated_points.extend([points[-1]] * shortfall)

    return interpolated_points
135
-
136
-
137
# Use CUDA when it is available, otherwise fall back to the CPU.
torch_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


def _load_pipeline():
    """Load the zeroscope spatial-aware pipeline (fp32) onto ``torch_device``."""
    return TextToVideoSDPipelineSpatialAware.from_pretrained(
        "cerspense/zeroscope_v2_576w", torch_dtype=torch.float, variant="fp32").to(torch_device)


# The pipeline is loaded once at import time and shared by every request.
# A failed first attempt is retried exactly once. Only ``Exception`` is caught
# so that Ctrl-C / SystemExit abort immediately instead of triggering a second
# load.
try:
    pipe = _load_pipeline()
except Exception as err:
    print(f"Loading the pipeline failed ({err!r}); retrying once.")
    pipe = _load_pipeline()
146
-
147
-
148
def generate_bb(prompt, fg_object, aspect_ratio, size, motion_direction, trajectory):
    """Generate a video in which a foreground box follows the drawn trajectory.

    Args:
        prompt: Scene description; embedded in ``"A realistic lively {prompt}"``.
        fg_object: Prompt for the foreground object (passed as ``fg_prompt``).
        aspect_ratio: ``"square"``, ``"horizontal"`` or anything else (treated
            as vertical); controls the box width relative to its height.
        size: ``"large"``, ``"medium"`` or anything else (treated as small).
        motion_direction: ``"l2r"``/``"r2l"`` order the points by column,
            anything else orders them by row (``"d2u"`` reversed).
        trajectory: Editor value; only ``trajectory['composite']`` is read.

    Returns:
        Whatever ``create_video(video_frames, fps=8, type="final")`` returns.

    Raises:
        gr.Error: If fewer than two usable trajectory points are found.
    """
    # One binary mask per frame at latent resolution (frame size / 8).
    final_canvas = torch.zeros((NUM_FRAMES, 320 // 8, 576 // 8))

    # bbox_size_x is the extent along rows (height), bbox_size_y along columns
    # (width); see how they are applied to the canvas below.
    if size == "large":
        bbox_size_x = LARGE_BOX_SIZE
    elif size == "medium":
        bbox_size_x = int(LARGE_BOX_SIZE * 0.75)
    else:
        bbox_size_x = LARGE_BOX_SIZE // 2

    if aspect_ratio == "square":
        bbox_size_y = bbox_size_x
    elif aspect_ratio == "horizontal":
        bbox_size_y = int(bbox_size_x * 1.33)
    else:
        bbox_size_y = int(bbox_size_x * 0.75)

    image = cv2.resize(trajectory['composite'], (576, 320))
    # NOTE(review): the Paint widget is configured with image_mode="RGB", so
    # COLOR_RGB2GRAY looks like the matching flag - confirm before changing,
    # since it alters which stroke colours pass the threshold below.
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    _, thresh = cv2.threshold(gray, 30, 255, cv2.THRESH_BINARY_INV)
    contours, _ = cv2.findContours(thresh, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)

    # Collect simplified contour vertices as [row, col] pairs.
    bbox_points = []
    for contour in contours:
        # Approximate the contour to reduce the number of points.
        epsilon = 0.01 * cv2.arcLength(contour, True)
        approx = cv2.approxPolyDP(contour, epsilon, True)
        for point in approx:
            # OpenCV points are (column, row); store them reversed.
            col, row = point.ravel()
            # Drop vertices lying on the image border.
            if 1 <= row < 319 and 1 <= col < 575:
                bbox_points.append([row, col])

    if len(bbox_points) < 2:
        # Without this guard an empty or one-point drawing crashes inside the
        # interpolation step with an unhelpful error.
        raise gr.Error("Please draw a trajectory on the canvas before generating.")

    if motion_direction in ['l2r', 'r2l']:
        sorted_points = sorted(bbox_points, key=lambda p: p[1], reverse=motion_direction == "r2l")
    else:
        sorted_points = sorted(bbox_points, key=lambda p: p[0], reverse=motion_direction == "d2u")
    final_points = interpolate_points(np.array(sorted_points), NUM_FRAMES)

    for i in range(NUM_FRAMES):
        row = int(final_points[i][0])
        col = int(final_points[i][1])
        # Clamp the box so it keeps a 16-pixel padding from every frame edge,
        # then convert pixel coordinates to latent cells.
        top = max(int(row - bbox_size_x / 2), 16) // 8
        bottom = min(int(row + bbox_size_x / 2), 304) // 8
        left = max(int(col - bbox_size_y / 2), 16) // 8
        right = min(int(col + bbox_size_y / 2), 560) // 8
        final_canvas[i, top:bottom, left:right] = 1

    torch_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    fg_masks = final_canvas.unsqueeze(1).to(torch_device)

    # Save fg_masks as images; create the directory first so the writes do not
    # depend on it already existing.
    os.makedirs("./fg_masks", exist_ok=True)
    for i in range(NUM_FRAMES):
        cv2.imwrite(f"./fg_masks/frame_{i:04d}.png", fg_masks[i, 0].cpu().numpy() * 255)

    # Fixed seed: the same inputs start from the same initial noise.
    seed = 2
    random_latents = torch.randn([1, 4, NUM_FRAMES, 40, 72], generator=torch.Generator().manual_seed(seed)).to(torch_device)
    overall_prompt = f"A realistic lively {prompt}"
    video_frames = generate_video(pipe, overall_prompt, random_latents, get_latents=False, num_frames=NUM_FRAMES, num_inference_steps=40,
                                  fg_masks=fg_masks, fg_masked_latents=None, frozen_steps=2, frozen_prompt=None, fg_prompt=fg_object)

    return create_video(video_frames, fps=8, type="final")
210
-
211
-
212
-
213
# Input widgets, listed in the positional order of generate_bb's parameters
# (the two leading "text" inputs are the prompt and the foreground object).
_aspect_ratio_input = gr.Radio(choices=["square", "horizontal", "vertical"])
_size_input = gr.Radio(choices=["small", "medium", "large"])
_direction_input = gr.Radio(choices=["l2r", "r2l", "u2d", "d2u"])
# Drawing canvas for the trajectory, initialised to an all-zero 320x576 image.
_trajectory_input = gr.Paint(
    value={
        'background': np.zeros((320, 576)),
        'layers': [],
        'composite': np.zeros((320, 576)),
    },
    type="numpy",
    image_mode="RGB",
    height=320,
    width=576,
)

demo = gr.Interface(
    fn=generate_bb,
    inputs=[
        "text",
        "text",
        _aspect_ratio_input,
        _size_input,
        _direction_input,
        _trajectory_input,
    ],
    outputs=gr.Video(),
)


if __name__ == "__main__":
    demo.launch(share=True)