# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
""" | |
Sample new images from a pre-trained Latte. | |
""" | |
import argparse
import os
import sys

import imageio
import torch
from accelerate import Accelerator
from einops import rearrange
from tqdm import tqdm

from opensora.dataset import ae_denorm
from opensora.models.ae import ae_channel_config, getae, ae_stride_config
from opensora.models.ae.videobase import CausalVQVAEModelWrapper
from opensora.models.diffusion import Diffusion_models
from opensora.models.diffusion.diffusion import create_diffusion_T as create_diffusion
from opensora.models.diffusion.latte.modeling_latte import Latte
from opensora.utils.utils import find_model
# Allow TF32 tensor cores for matmul/cudnn: faster inference on Ampere+ GPUs
# at a small (acceptable for sampling) precision cost.
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True
def main(args):
    """Sample videos from a pre-trained Latte checkpoint and save them as .mp4.

    Args:
        args: parsed CLI namespace (see the argparse block at the bottom of
            this file). Key fields: ckpt, ae, image_size, num_frames,
            num_sample, num_sampling_steps, cfg_scale, sample_method,
            train_classcondition, save_video_path, fps.

    Raises:
        ValueError: if ``args.sample_method`` is not 'ddim' or 'ddpm'.
    """
    # Inference only — no autograd bookkeeping needed.
    torch.set_grad_enabled(False)
    assert torch.cuda.is_available(), "Training currently requires at least one GPU."

    # Setup accelerator (device placement + optional mixed precision):
    accelerator = Accelerator(mixed_precision=args.mixed_precision)
    device = accelerator.device

    using_cfg = args.cfg_scale > 1.0

    # ae_stride_config[ae] holds per-axis strides; index 0 is temporal,
    # 1 and 2 are the spatial strides used to derive the latent size.
    latent_size = (args.image_size // ae_stride_config[args.ae][1],
                   args.image_size // ae_stride_config[args.ae][2])
    args.latent_size = latent_size

    # Load model:
    model = Latte.from_pretrained(args.ckpt, subfolder="model").to(device)
    model.eval()  # important!
    model = accelerator.prepare(model)

    diffusion = create_diffusion(str(args.num_sampling_steps))
    ae = getae(args).to(device)
    if isinstance(ae, CausalVQVAEModelWrapper):
        # Causal VQ-VAE keeps an extra leading frame, hence the +1.
        video_length = args.num_frames // ae_stride_config[args.ae][0] + 1
    else:
        video_length = args.num_frames // ae_stride_config[args.ae][0]

    # Fix: validate the sampler choice up front. The original code left
    # `samples` unbound for an unknown method and crashed later with a
    # confusing NameError inside the loop.
    if args.sample_method not in ('ddim', 'ddpm'):
        raise ValueError(f"Unknown sample_method: {args.sample_method!r} "
                         "(expected 'ddim' or 'ddpm')")

    # Fix: create the output directory once, race-free, outside the loop
    # (was an exists()/makedirs() pair executed on every iteration).
    os.makedirs(args.save_video_path, exist_ok=True)

    bar = tqdm(range(args.num_sample))
    for i in bar:
        # Create sampling noise:
        z = torch.randn(1, model.module.in_channels, video_length,
                        latent_size[0], latent_size[1], device=device)

        # Setup classifier-free guidance:
        if using_cfg and args.train_classcondition:
            # Duplicate the noise so conditional and unconditional branches
            # run in one batch; class id == num_classes is the "null" label.
            z = torch.cat([z, z], 0)
            y = torch.randint(0, args.num_classes, (1,), device=device)
            cls_id = str(int(y.detach().cpu()))
            y_null = torch.tensor([args.num_classes] * 1, device=device)
            y = torch.cat([y, y_null], dim=0)
            model_kwargs = dict(class_labels=y, cfg_scale=args.cfg_scale)
            sample_fn = model.module.forward_with_cfg
        elif args.train_classcondition:
            sample_fn = model.forward
            y = torch.randint(0, args.num_classes, (1,), device=device)
            cls_id = str(int(y.detach().cpu()))
            model_kwargs = dict(class_labels=y)
        else:
            sample_fn = model.forward
            model_kwargs = dict(class_labels=None)

        # Sample latents:
        if args.sample_method == 'ddim':
            samples = diffusion.ddim_sample_loop(
                sample_fn, z.shape, z, clip_denoised=False,
                model_kwargs=model_kwargs, progress=True, device=device
            )
        else:  # 'ddpm' — validated above
            samples = diffusion.p_sample_loop(
                sample_fn, z.shape, z, clip_denoised=False,
                model_kwargs=model_kwargs, progress=True, device=device
            )

        # Decode latents back to pixel space:
        with torch.no_grad():
            samples = ae.decode(samples)

        # De-normalize to uint8 and reorder to (T, H, W, C) for imageio:
        video_ = (ae_denorm[args.ae](samples[0]) * 255).add_(0.5).clamp_(0, 255).to(dtype=torch.uint8).cpu().permute(0, 2, 3, 1).contiguous()
        if args.train_classcondition:
            video_save_path = os.path.join(args.save_video_path, f"sample_{i:03d}_cls" + str(cls_id) + '.mp4')
        else:
            video_save_path = os.path.join(args.save_video_path, f"sample_{i:03d}" + '.mp4')
        print(video_save_path)
        imageio.mimwrite(video_save_path, video_, fps=args.fps, quality=9)
    print('save path {}'.format(args.save_video_path))
if __name__ == "__main__": | |
parser = argparse.ArgumentParser() | |
parser.add_argument("--ckpt", type=str, default="") | |
parser.add_argument("--model", type=str, default='Latte-XL/122') | |
parser.add_argument("--ae", type=str, default='stabilityai/sd-vae-ft-mse') | |
parser.add_argument("--save_video_path", type=str, default="./sample_videos/") | |
parser.add_argument("--fps", type=int, default=10) | |
parser.add_argument("--num_classes", type=int, default=101) | |
parser.add_argument("--num_frames", type=int, default=16) | |
parser.add_argument("--image_size", type=int, default=256) | |
parser.add_argument("--train_classcondition", action="store_true") | |
parser.add_argument("--num_sampling_steps", type=int, default=250) | |
parser.add_argument("--num_sample", type=int, default=1) | |
parser.add_argument("--cfg_scale", type=float, default=1.0) | |
parser.add_argument("--sample_method", type=str, default='ddpm') | |
parser.add_argument("--mixed_precision", type=str, default=None, choices=[None, "fp16", "bf16"]) | |
parser.add_argument("--attention_mode", type=str, choices=['xformers', 'math', 'flash'], default="math") | |
args = parser.parse_args() | |
main(args) | |