Update app.py
app.py
CHANGED
@@ -2,55 +2,31 @@ import os
 import uuid
 from omegaconf import OmegaConf
 import spaces
-
 import random
-
 import imageio
 import torch
 import torchvision
 import gradio as gr
 import numpy as np
-
+from fastapi import FastAPI
+from fastapi.responses import FileResponse
 from gradio.components import Textbox, Video
 from huggingface_hub import hf_hub_download
-
 from utils.common_utils import load_model_checkpoint
 from utils.utils import instantiate_from_config
 from scheduler.t2v_turbo_scheduler import T2VTurboScheduler
 from pipeline.t2v_turbo_vc2_pipeline import T2VTurboVC2Pipeline
 
-
-DESCRIPTION = """
-Our model is distilled from [VideoCrafter2](https://ailab-cvc.github.io/videocrafter2/).
-
-T2V-Turbo learns a LoRA on top of the base model by aligning to the reward feedback from [HPSv2.1](https://github.com/tgxs002/HPSv2/tree/master) and [InternVid2 Stage 2 Model](https://huggingface.co/OpenGVLab/InternVideo2-Stage2_1B-224p-f4).
-
-T2V-Turbo-v2 refines the training techniques by finetuning the full base model and further aligns with [CLIPScore](https://huggingface.co/laion/CLIP-ViT-H-14-laion2B-s32B-b79K).
-
-T2V-Turbo trains on pure WebVid-10M data, whereas T2V-Turbo-v2 carefully optimizes different learning objectives with a mixture of VidGen-1M and WebVid-10M data.
-
-Moreover, T2V-Turbo-v2 supports distilling motion priors from the training videos.
-
-[Project page for T2V-Turbo](https://t2v-turbo.github.io) 🥳
-
-[Project page for T2V-Turbo-v2](https://t2v-turbo-v2.github.io) 🤓
-"""
-if torch.cuda.is_available():
-    DESCRIPTION += "\n<p>Running on CUDA 😀</p>"
-elif hasattr(torch, "xpu") and torch.xpu.is_available():
-    DESCRIPTION += "\n<p>Running on XPU 🤓</p>"
-else:
-    DESCRIPTION += "\n<p>Running on CPU 🥶 This demo does not work on CPU.</p>"
+# Keep all your original constants and DESCRIPTION
 
 MAX_SEED = np.iinfo(np.int32).max
-
+app = FastAPI()
 
 def randomize_seed_fn(seed: int, randomize_seed: bool) -> int:
     if randomize_seed:
         seed = random.randint(0, MAX_SEED)
     return seed
 
-
 def save_video(video_array, video_save_path, fps: int = 16):
     video = video_array.detach().cpu()
     video = torch.clamp(video.float(), -1.0, 1.0)
@@ -62,17 +38,7 @@ def save_video(video_array, video_save_path, fps: int = 16):
         video_save_path, video, fps=fps, video_codec="h264", options={"crf": "10"}
     )
 
-example_txt = [
-    "An astronaut riding a horse.",
-    "Darth vader surfing in waves.",
-    "light wind, feathers moving, she moves her gaze, 4k",
-    "a girl floating underwater.",
-    "Pikachu snowboarding.",
-    "Self-portrait oil painting, a beautiful cyborg with golden hair, 8k",
-    "A musician strums his guitar, serenading the moonlit night.",
-]
-
-examples = [[i, 7.5, 0.5, 16, 16, 0, True, "bf16"] for i in example_txt]
+# Keep your original example_txt and examples
 
 @spaces.GPU(duration=120)
 @torch.inference_mode()
@@ -87,8 +53,8 @@ def generate(
     param_dtype="bf16",
     motion_gs: float = 0.05,
     fps: int = 8,
+    is_api: bool = False,  # New parameter to handle API calls
 ):
-
     seed = randomize_seed_fn(seed, randomize_seed)
     torch.manual_seed(seed)
 
@@ -123,26 +89,70 @@ def generate(
     )
 
     torch.cuda.empty_cache()
-
+
+    # Generate unique filename for API calls
+    if is_api:
+        video_filename = f"{uuid.uuid4()}.mp4"
+    else:
+        video_filename = "tmp.mp4"
+
     root_path = "./videos/"
     os.makedirs(root_path, exist_ok=True)
-    video_save_path = os.path.join(root_path, "tmp.mp4")
+    video_save_path = os.path.join(root_path, video_filename)
 
     save_video(result[0], video_save_path, fps=fps)
     display_model_info = f"Video size: {num_frames}x320x512, Sampling Step: {num_inference_steps}, Guidance Scale: {guidance_scale}"
+
+    if is_api:
+        return {
+            "video_path": video_save_path,
+            "prompt": prompt,
+            "model_info": display_model_info,
+            "seed": seed
+        }
     return video_save_path, prompt, display_model_info, seed
 
-
-
-
-
-
-
-
+# API endpoint
+@app.post("/generate")
+async def generate_api(
+    prompt: str,
+    guidance_scale: float = 7.5,
+    percentage: float = 0.5,
+    num_inference_steps: int = 4,
+    num_frames: int = 16,
+    seed: int = 0,
+    randomize_seed: bool = False,
+    param_dtype: str = "bf16",
+    motion_gs: float = 0.05,
+    fps: int = 8,
+):
+    result = generate(
+        prompt=prompt,
+        guidance_scale=guidance_scale,
+        percentage=percentage,
+        num_inference_steps=num_inference_steps,
+        num_frames=num_frames,
+        seed=seed,
+        randomize_seed=randomize_seed,
+        param_dtype=param_dtype,
+        motion_gs=motion_gs,
+        fps=fps,
+        is_api=True
+    )
+
+    return FileResponse(
+        result["video_path"],
+        media_type="video/mp4",
+        headers={
+            "X-Model-Info": result["model_info"],
+            "X-Seed": str(result["seed"])
+        }
+    )
 
 if __name__ == "__main__":
     device = torch.device("cuda:0")
 
+    # Keep all your original model initialization code
     config = OmegaConf.load("configs/inference_t2v_512_v2.0.yaml")
     model_config = config.pop("model", OmegaConf.create())
     pretrained_t2v = instantiate_from_config(model_config)
@@ -169,54 +179,18 @@ if __name__ == "__main__":
     pipeline = T2VTurboVC2Pipeline(pretrained_t2v, scheduler, model_config)
     pipeline.to(device)
 
+    # Mount both Gradio and FastAPI
     demo = gr.Interface(
-        fn=generate,
+        fn=lambda *args: generate(*args, is_api=False),
         inputs=[
             Textbox(label="", placeholder="Please enter your prompt. \n"),
-            gr.Slider(
-                label="Guidance scale",
-                minimum=2,
-                maximum=14,
-                step=0.1,
-                value=7.5,
-            ),
-            gr.Slider(
-                label="Percentage of steps to apply motion guidance (v2 w/ MG only)",
-                minimum=0.0,
-                maximum=0.5,
-                step=0.05,
-                value=0.5,
-            ),
-            gr.Slider(
-                label="Number of inference steps",
-                minimum=4,
-                maximum=50,
-                step=1,
-                value=16,
-            ),
-            gr.Slider(
-                label="Number of Video Frames",
-                minimum=16,
-                maximum=48,
-                step=8,
-                value=16,
-            ),
-            gr.Slider(
-                label="Seed",
-                minimum=0,
-                maximum=MAX_SEED,
-                step=1,
-                value=0,
-                randomize=True,
-            ),
+            gr.Slider(label="Guidance scale", minimum=2, maximum=14, step=0.1, value=7.5),
+            gr.Slider(label="Percentage of steps to apply motion guidance", minimum=0.0, maximum=0.5, step=0.05, value=0.5),
+            gr.Slider(label="Number of inference steps", minimum=4, maximum=50, step=1, value=16),
+            gr.Slider(label="Number of Video Frames", minimum=16, maximum=48, step=8, value=16),
+            gr.Slider(label="Seed", minimum=0, maximum=MAX_SEED, step=1, value=0, randomize=True),
             gr.Checkbox(label="Randomize seed", value=True),
-            gr.Radio(
-                ["bf16", "fp16", "fp32"],
-                label="torch.dtype",
-                value="bf16",
-                interactive=True,
-                info="Dtype for inference. Default is bf16.",
-            )
+            gr.Radio(["bf16", "fp16", "fp32"], label="torch.dtype", value="bf16", interactive=True),
         ],
         outputs=[
             gr.Video(label="Generated Video", width=512, height=320, interactive=False, autoplay=True),
@@ -231,4 +205,9 @@ if __name__ == "__main__":
         cache_examples=False,
         concurrency_limit=10,
     )
-
+
+    app = gr.mount_gradio_app(app, demo, path="/")
+
+    # Run both servers
+    import uvicorn
+    uvicorn.run(app, host="0.0.0.0", port=7860)
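
Note on using the new endpoint: this commit exposes a plain HTTP route alongside the Gradio UI. Below is a minimal client sketch, untested, with BASE_URL as a placeholder for wherever this Space is actually served. Because generate_api declares only scalar parameters, FastAPI reads them from the query string even on a POST.

import requests

BASE_URL = "http://localhost:7860"  # placeholder; point this at the running Space

resp = requests.post(
    f"{BASE_URL}/generate",
    params={"prompt": "An astronaut riding a horse.", "num_inference_steps": 16},
    timeout=600,  # generation can take minutes on a cold GPU
)
resp.raise_for_status()

# The response body is the mp4 itself; the metadata rides in the custom
# headers set by generate_api.
with open("output.mp4", "wb") as f:
    f.write(resp.content)
print(resp.headers.get("X-Model-Info"), resp.headers.get("X-Seed"))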
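
Two details in the final block are easy to miss. First, /generate is registered on app before gr.mount_gradio_app(app, demo, path="/"), and Starlette matches routes in registration order, so the API route should take precedence over the Gradio mount at the root path. Second, uvicorn.run(app, host="0.0.0.0", port=7860) now serves both the API and the UI in place of Gradio's built-in launcher. One caveat: each API call writes a fresh {uuid4}.mp4 under ./videos/, so files will accumulate there until something cleans them up.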