wxDai committed
Commit c64dfa4
0 Parent(s)
This view is limited to 50 files because it contains too many changes.
Files changed (50)
  1. .gitignore +12 -0
  2. LICENSE +25 -0
  3. README.md +11 -0
  4. app.py +234 -0
  5. configs/mld_t2m.yaml +104 -0
  6. configs/modules/denoiser.yaml +28 -0
  7. configs/modules/motion_vae.yaml +18 -0
  8. configs/modules/noise_optimizer.yaml +15 -0
  9. configs/modules/scheduler_ddim.yaml +14 -0
  10. configs/modules/scheduler_lcm.yaml +19 -0
  11. configs/modules/text_encoder.yaml +5 -0
  12. configs/modules/traj_encoder.yaml +17 -0
  13. configs/motionlcm_control_s.yaml +113 -0
  14. configs/motionlcm_control_t.yaml +111 -0
  15. configs/motionlcm_t2m.yaml +109 -0
  16. configs/motionlcm_t2m_clt.yaml +69 -0
  17. configs/vae.yaml +103 -0
  18. configs_v1/modules/denoiser.yaml +28 -0
  19. configs_v1/modules/motion_vae.yaml +18 -0
  20. configs_v1/modules/scheduler_lcm.yaml +11 -0
  21. configs_v1/modules/text_encoder.yaml +5 -0
  22. configs_v1/modules/traj_encoder.yaml +17 -0
  23. configs_v1/motionlcm_control_t.yaml +114 -0
  24. configs_v1/motionlcm_t2m.yaml +109 -0
  25. demo.py +196 -0
  26. fit.py +136 -0
  27. mld/__init__.py +0 -0
  28. mld/config.py +52 -0
  29. mld/data/__init__.py +0 -0
  30. mld/data/base.py +58 -0
  31. mld/data/data.py +73 -0
  32. mld/data/get_data.py +79 -0
  33. mld/data/humanml/__init__.py +0 -0
  34. mld/data/humanml/common/quaternion.py +29 -0
  35. mld/data/humanml/dataset.py +348 -0
  36. mld/data/humanml/scripts/motion_process.py +51 -0
  37. mld/data/humanml/utils/__init__.py +0 -0
  38. mld/data/humanml/utils/paramUtil.py +62 -0
  39. mld/data/humanml/utils/plot_script.py +98 -0
  40. mld/data/humanml/utils/word_vectorizer.py +82 -0
  41. mld/data/utils.py +52 -0
  42. mld/launch/__init__.py +0 -0
  43. mld/launch/blender.py +23 -0
  44. mld/models/__init__.py +0 -0
  45. mld/models/architectures/__init__.py +0 -0
  46. mld/models/architectures/dno.py +79 -0
  47. mld/models/architectures/mld_clip.py +72 -0
  48. mld/models/architectures/mld_denoiser.py +200 -0
  49. mld/models/architectures/mld_traj_encoder.py +64 -0
  50. mld/models/architectures/mld_vae.py +136 -0
.gitignore ADDED
@@ -0,0 +1,12 @@
+ **/*.pyc
+ .idea/
+ __pycache__/
+
+ deps/
+ datasets/
+ experiments_t2m/
+ experiments_t2m_test/
+ experiments_control/
+ experiments_control_test/
+ experiments_recons/
+ experiments_recons_test/
LICENSE ADDED
@@ -0,0 +1,25 @@
+ Copyright Tsinghua University and Shanghai AI Laboratory. All Rights Reserved.
+
+ License for Non-commercial Scientific Research Purposes.
+
+ For more information see <https://github.com/Dai-Wenxun/MotionLCM>.
+ If you use this software, please cite the corresponding publications
+ listed on the above website.
+
+ Permission to use, copy, modify, and distribute this software and its
+ documentation for educational, research, and non-profit purposes only.
+ Any modification based on this work must be open-source and prohibited
+ for commercial, pornographic, military, or surveillance use.
+
+ The authors grant you a non-exclusive, worldwide, non-transferable,
+ non-sublicensable, revocable, royalty-free, and limited license under
+ our copyright interests to reproduce, distribute, and create derivative
+ works of the text, videos, and codes solely for your non-commercial
+ research purposes.
+
+ You must retain, in the source form of any derivative works that you
+ distribute, all copyright, patent, trademark, and attribution notices
+ from the source form of this work.
+
+ For commercial uses of this software, please send email to all people
+ in the author list.
README.md ADDED
@@ -0,0 +1,11 @@
+ ---
+ title: MotionLCM
+ emoji: 🏎️💨
+ colorFrom: yellow
+ colorTo: pink
+ sdk: gradio
+ sdk_version: 4.44.1
+ app_file: app.py
+ pinned: false
+ python_version: 3.10.12
+ ---
app.py ADDED
@@ -0,0 +1,234 @@
+ import os
+ import time
+ import random
+ import datetime
+ import os.path as osp
+ from functools import partial
+
+ import tqdm
+ from omegaconf import OmegaConf
+
+ import torch
+ import gradio as gr
+
+ from mld.config import get_module_config
+ from mld.data.get_data import get_dataset
+ from mld.models.modeltype.mld import MLD
+ from mld.utils.utils import set_seed
+ from mld.data.humanml.utils.plot_script import plot_3d_motion
+
+ os.environ["TOKENIZERS_PARALLELISM"] = "false"
+
+ WEBSITE = """
+ <div class="embed_hidden">
+ <h1 style='text-align: center'> MotionLCM: Real-time Controllable Motion Generation via Latent Consistency Model </h1>
+ <h2 style='text-align: center'>
+ <a href="https://github.com/Dai-Wenxun/" target="_blank"><nobr>Wenxun Dai</nobr><sup>1</sup></a> &emsp;
+ <a href="https://lhchen.top/" target="_blank"><nobr>Ling-Hao Chen</nobr></a><sup>1</sup> &emsp;
+ <a href="https://wangjingbo1219.github.io/" target="_blank"><nobr>Jingbo Wang</nobr></a><sup>2</sup> &emsp;
+ <a href="https://moonsliu.github.io/" target="_blank"><nobr>Jinpeng Liu</nobr></a><sup>1</sup> &emsp;
+ <a href="https://daibo.info/" target="_blank"><nobr>Bo Dai</nobr></a><sup>2</sup> &emsp;
+ <a href="https://andytang15.github.io/" target="_blank"><nobr>Yansong Tang</nobr></a><sup>1</sup>
+ </h2>
+ <h2 style='text-align: center'>
+ <nobr><sup>1</sup>Tsinghua University</nobr> &emsp;
+ <nobr><sup>2</sup>Shanghai AI Laboratory</nobr>
+ </h2>
+ </div>
+ """
+
+ WEBSITE_bottom = """
+ <div class="embed_hidden">
+ <p>
+ Space adapted from <a href="https://huggingface.co/spaces/Mathux/TMR" target="_blank">TMR</a>
+ and <a href="https://huggingface.co/spaces/MeYourHint/MoMask" target="_blank">MoMask</a>.
+ </p>
+ </div>
+ """
+
+ EXAMPLES = [
+ "a person does a jump",
+ "a person waves both arms in the air.",
+ "The person takes 4 steps backwards.",
+ "this person bends forward as if to bow.",
+ "The person was pushed but did not fall.",
+ "a man walks forward in a snake like pattern.",
+ "a man paces back and forth along the same line.",
+ "with arms out to the sides a person walks forward",
+ "A man bends down and picks something up with his right hand.",
+ "The man walked forward, spun right on one foot and walked back to his original position.",
+ "a person slightly bent over with right hand pressing against the air walks forward slowly"
+ ]
+
+ if not os.path.exists("./experiments_t2m/"):
+ os.system("bash prepare/download_pretrained_models.sh")
+ if not os.path.exists('./deps/glove/'):
+ os.system("bash prepare/download_glove.sh")
+ if not os.path.exists('./deps/sentence-t5-large/'):
+ os.system("bash prepare/prepare_t5.sh")
+ if not os.path.exists('./deps/t2m/'):
+ os.system("bash prepare/download_t2m_evaluators.sh")
+ if not os.path.exists('./datasets/humanml3d/'):
+ os.system("bash prepare/prepare_tiny_humanml3d.sh")
+
+ DEFAULT_TEXT = "A person is "
+ MAX_VIDEOS = 8
+ NUM_ROWS = 2
+ NUM_COLS = MAX_VIDEOS // NUM_ROWS
+ EXAMPLES_PER_PAGE = 12
+ T2M_CFG = "./configs_v1/motionlcm_t2m.yaml"
+
+ device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
+ print("device: ", device)
+
+ cfg = OmegaConf.load(T2M_CFG)
+ cfg_root = os.path.dirname(T2M_CFG)
+ cfg_model = get_module_config(cfg.model, cfg.model.target, cfg_root)
+ cfg = OmegaConf.merge(cfg, cfg_model)
+ set_seed(cfg.SEED_VALUE)
+
+ name_time_str = osp.join(cfg.NAME, datetime.datetime.now().strftime("%Y-%m-%dT%H-%M-%S"))
+ cfg.output_dir = osp.join(cfg.TEST_FOLDER, name_time_str)
+ vis_dir = osp.join(cfg.output_dir, 'samples')
+ os.makedirs(cfg.output_dir, exist_ok=False)
+ os.makedirs(vis_dir, exist_ok=False)
+
+ state_dict = torch.load(cfg.TEST.CHECKPOINTS, map_location="cpu")["state_dict"]
+ print("Loading checkpoints from {}".format(cfg.TEST.CHECKPOINTS))
+
+ is_lcm = False
+ lcm_key = 'denoiser.time_embedding.cond_proj.weight' # unique key for CFG
+ if lcm_key in state_dict:
+ is_lcm = True
+ time_cond_proj_dim = state_dict[lcm_key].shape[1]
+ cfg.model.denoiser.params.time_cond_proj_dim = time_cond_proj_dim
+ print(f'Is LCM: {is_lcm}')
+
+ dataset = get_dataset(cfg)
+ model = MLD(cfg, dataset)
+ model.to(device)
+ model.eval()
+ model.requires_grad_(False)
+ model.load_state_dict(state_dict)
+
+ FPS = eval(f"cfg.DATASET.{cfg.DATASET.NAME.upper()}.FRAME_RATE")
+
+
+ @torch.no_grad()
+ def generate(text_, motion_len_):
+ batch = {"text": [text_] * MAX_VIDEOS, "length": [motion_len_] * MAX_VIDEOS}
+
+ s = time.time()
+ joints = model(batch)[0]
+ runtime_infer = round(time.time() - s, 3)
+
+ s = time.time()
+ path = []
+ for i in tqdm.tqdm(range(len(joints))):
+ uid = random.randrange(999999999)
+ video_path = osp.join(vis_dir, f"sample_{uid}.mp4")
+ plot_3d_motion(video_path, joints[i].detach().cpu().numpy(), '', fps=FPS)
+ path.append(video_path)
+ runtime_draw = round(time.time() - s, 3)
+
+ runtime_info = f'Inference {len(joints)} motions, Runtime (Inference): {runtime_infer}s, ' \
+ f'Runtime (Draw Skeleton): {runtime_draw}s, device: {device} '
+
+ return path, runtime_info
+
+
+ def generate_component(generate_function, text_, motion_len_, num_inference_steps_, guidance_scale_):
+ if text_ == DEFAULT_TEXT or text_ == "" or text_ is None:
+ return [None] * MAX_VIDEOS + ["Please modify the default text prompt."]
+
+ model.cfg.model.scheduler.num_inference_steps = num_inference_steps_
+ model.guidance_scale = guidance_scale_
+ motion_len_ = max(36, min(int(float(motion_len_) * FPS), 196))
+ paths, info = generate_function(text_, motion_len_)
+ paths = paths + [None] * (MAX_VIDEOS - len(paths))
+ return paths + [info]
+
+ theme = gr.themes.Default(primary_hue="purple", secondary_hue="gray")
+ generate_and_show = partial(generate_component, generate)
+
+ with gr.Blocks(theme=theme) as demo:
+ gr.HTML(WEBSITE)
+ videos = []
+
+ with gr.Row():
+ with gr.Column(scale=3):
+ text = gr.Textbox(
+ show_label=True,
+ label="Text prompt",
+ value=DEFAULT_TEXT,
+ )
+
+ with gr.Row():
+ with gr.Column(scale=1):
+ motion_len = gr.Slider(
+ minimum=1.8,
+ maximum=9.8,
+ step=0.2,
+ value=5.0,
+ label="Motion length",
+ info="Motion duration in seconds: [1.8s, 9.8s] (FPS = 20)."
+ )
+
+ with gr.Column(scale=1):
+ num_inference_steps = gr.Slider(
+ minimum=1,
+ maximum=4,
+ step=1,
+ value=1,
+ label="Inference steps",
+ info="Number of inference steps.",
+ )
+
+ cfg = gr.Slider(
+ minimum=1,
+ maximum=15,
+ step=0.5,
+ value=7.5,
+ label="CFG",
+ info="Classifier-free diffusion guidance.",
+ )
+
+ gen_btn = gr.Button("Generate", variant="primary")
+ clear = gr.Button("Clear", variant="secondary")
+
+ results = gr.Textbox(show_label=True,
+ label='Inference info (runtime and device)',
+ info='Real-time inference cannot be achieved using the free CPU. Local GPU deployment is recommended.',
+ interactive=False)
+
+ with gr.Column(scale=2):
+ examples = gr.Examples(
+ examples=EXAMPLES,
+ inputs=[text],
+ examples_per_page=EXAMPLES_PER_PAGE)
+
+ for i in range(NUM_ROWS):
+ with gr.Row():
+ for j in range(NUM_COLS):
+ video = gr.Video(autoplay=True, loop=True)
+ videos.append(video)
+
+ # gr.HTML(WEBSITE_bottom)
+
+ gen_btn.click(
+ fn=generate_and_show,
+ inputs=[text, motion_len, num_inference_steps, cfg],
+ outputs=videos+[results],
+ )
+ text.submit(
+ fn=generate_and_show,
+ inputs=[text, motion_len, num_inference_steps, cfg],
+ outputs=videos+[results],
+ )
+
+ def clear_videos():
+ return [None] * MAX_VIDEOS + [DEFAULT_TEXT] + [None]
+
+ clear.click(fn=clear_videos, outputs=videos + [text] + [results])
+
+ demo.launch()
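Note on the length handling in generate_component above: the slider value is in seconds, while the model consumes a frame count clamped to the 36-196 frame range used here (FPS = 20 per the config). A minimal standalone sketch of the same conversion:

# Mirrors the clamp in generate_component: seconds -> frames, bounded to [36, 196].
FPS = 20.0

def seconds_to_frames(motion_len_s: float, fps: float = FPS) -> int:
    # same expression as: max(36, min(int(float(motion_len_) * FPS), 196))
    return max(36, min(int(motion_len_s * fps), 196))

if __name__ == "__main__":
    for s in (1.8, 5.0, 9.8, 12.0):
        print(s, "s ->", seconds_to_frames(s), "frames")
    # 1.8 s -> 36, 5.0 s -> 100, 9.8 s -> 196, 12.0 s -> 196 (clamped)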
configs/mld_t2m.yaml ADDED
@@ -0,0 +1,104 @@
+ FOLDER: './experiments_t2m'
+ TEST_FOLDER: './experiments_t2m_test'
+
+ NAME: 'mld_humanml'
+
+ SEED_VALUE: 1234
+
+ TRAIN:
+ BATCH_SIZE: 64
+ SPLIT: 'train'
+ NUM_WORKERS: 8
+ PERSISTENT_WORKERS: true
+
+ PRETRAINED: 'experiments_recons/vae_humanml/vae_humanml.ckpt'
+
+ validation_steps: -1
+ validation_epochs: 50
+ checkpointing_steps: -1
+ checkpointing_epochs: 50
+ max_train_steps: -1
+ max_train_epochs: 3000
+ learning_rate: 1e-4
+ lr_scheduler: "cosine"
+ lr_warmup_steps: 1000
+ adam_beta1: 0.9
+ adam_beta2: 0.999
+ adam_weight_decay: 0.0
+ adam_epsilon: 1e-08
+ max_grad_norm: 1.0
+ model_ema: false
+ model_ema_steps: 32
+ model_ema_decay: 0.999
+
+ VAL:
+ BATCH_SIZE: 32
+ SPLIT: 'test'
+ NUM_WORKERS: 12
+ PERSISTENT_WORKERS: true
+
+ TEST:
+ BATCH_SIZE: 32
+ SPLIT: 'test'
+ NUM_WORKERS: 12
+ PERSISTENT_WORKERS: true
+
+ CHECKPOINTS: 'experiments_t2m/mld_humanml/mld_humanml.ckpt'
+
+ # Testing Args
+ REPLICATION_TIMES: 20
+ MM_NUM_SAMPLES: 100
+ MM_NUM_REPEATS: 30
+ MM_NUM_TIMES: 10
+ DIVERSITY_TIMES: 300
+ DO_MM_TEST: true
+
+ DATASET:
+ NAME: 'humanml3d'
+ SMPL_PATH: './deps/smpl'
+ WORD_VERTILIZER_PATH: './deps/glove/'
+ HUMANML3D:
+ FRAME_RATE: 20.0
+ UNIT_LEN: 4
+ ROOT: './datasets/humanml3d'
+ CONTROL_ARGS:
+ CONTROL: false
+ TEMPORAL: false
+ TRAIN_JOINTS: [0]
+ TEST_JOINTS: [0]
+ TRAIN_DENSITY: 'random'
+ TEST_DENSITY: 100
+ MEAN_STD_PATH: './datasets/humanml_spatial_norm'
+ SAMPLER:
+ MAX_LEN: 200
+ MIN_LEN: 40
+ MAX_TEXT_LEN: 20
+ PADDING_TO_MAX: false
+ WINDOW_SIZE: null
+
+ METRIC:
+ DIST_SYNC_ON_STEP: true
+ TYPE: ['TM2TMetrics']
+
+ model:
+ target: ['motion_vae', 'text_encoder', 'denoiser', 'scheduler_ddim', 'noise_optimizer']
+ latent_dim: [16, 32]
+ guidance_scale: 7.5
+ guidance_uncondp: 0.1
+
+ t2m_textencoder:
+ dim_word: 300
+ dim_pos_ohot: 15
+ dim_text_hidden: 512
+ dim_coemb_hidden: 512
+
+ t2m_motionencoder:
+ dim_move_hidden: 512
+ dim_move_latent: 512
+ dim_motion_hidden: 1024
+ dim_motion_latent: 512
+
+ bert_path: './deps/distilbert-base-uncased'
+ clip_path: './deps/clip-vit-large-patch14'
+ t5_path: './deps/sentence-t5-large'
+ t2m_path: './deps/t2m/'
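The model.target list above names the module configs under configs/modules/ (motion_vae.yaml, text_encoder.yaml, ...) that are merged into the model node by get_module_config in mld/config.py (shown later in this diff). An illustrative OmegaConf sketch of that merge, using in-memory stand-ins instead of the real module files:

# Illustrative sketch (not the repo's exact code path): how a `model.target` list
# maps to per-module configs that get merged into the `model` node.
from omegaconf import OmegaConf

main_cfg = OmegaConf.create({
    "model": {"target": ["motion_vae", "denoiser"], "latent_dim": [16, 32]}
})
# stand-ins for configs/modules/motion_vae.yaml and configs/modules/denoiser.yaml
module_cfgs = [
    OmegaConf.create({"motion_vae": {"target": "mld.models.architectures.mld_vae.MldVae"}}),
    OmegaConf.create({"denoiser": {"target": "mld.models.architectures.mld_denoiser.MldDenoiser"}}),
]
for m in module_cfgs:
    main_cfg.model.merge_with(m)  # same merge call used by get_module_config

print(OmegaConf.to_yaml(main_cfg))  # model now contains motion_vae / denoiser sub-configs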
configs/modules/denoiser.yaml ADDED
@@ -0,0 +1,28 @@
+ denoiser:
+ target: mld.models.architectures.mld_denoiser.MldDenoiser
+ params:
+ latent_dim: ${model.latent_dim}
+ hidden_dim: 256
+ text_dim: 768
+ time_dim: 768
+ ff_size: 1024
+ num_layers: 9
+ num_heads: 4
+ dropout: 0.1
+ normalize_before: false
+ norm_eps: 1e-5
+ activation: 'gelu'
+ norm_post: true
+ activation_post: null
+ flip_sin_to_cos: true
+ freq_shift: 0
+ time_act_fn: 'silu'
+ time_post_act_fn: null
+ position_embedding: 'learned'
+ arch: 'trans_enc'
+ add_mem_pos: true
+ force_pre_post_proj: true
+ text_act_fn: null
+ zero_init_cond: true
+ controlnet_embed_dim: 256
+ controlnet_act_fn: 'silu'
configs/modules/motion_vae.yaml ADDED
@@ -0,0 +1,18 @@
+ motion_vae:
+ target: mld.models.architectures.mld_vae.MldVae
+ params:
+ nfeats: ${DATASET.NFEATS}
+ latent_dim: ${model.latent_dim}
+ hidden_dim: 256
+ force_pre_post_proj: true
+ ff_size: 1024
+ num_layers: 9
+ num_heads: 4
+ dropout: 0.1
+ arch: 'encoder_decoder'
+ normalize_before: false
+ norm_eps: 1e-5
+ activation: 'gelu'
+ norm_post: true
+ activation_post: null
+ position_embedding: 'learned'
configs/modules/noise_optimizer.yaml ADDED
@@ -0,0 +1,15 @@
+ noise_optimizer:
+ target: mld.models.architectures.dno.DNO
+ params:
+ optimize: false
+ max_train_steps: 400
+ learning_rate: 0.1
+ lr_scheduler: 'cosine'
+ lr_warmup_steps: 50
+ clip_grad: true
+ loss_hint_type: 'l2'
+ loss_diff_penalty: 0.000
+ loss_correlate_penalty: 100
+ visualize_samples: 0
+ visualize_ske_steps: []
+ output_dir: ${output_dir}
configs/modules/scheduler_ddim.yaml ADDED
@@ -0,0 +1,14 @@
+ scheduler:
+ target: diffusers.DDIMScheduler
+ num_inference_steps: 50
+ eta: 0.0
+ params:
+ num_train_timesteps: 1000
+ beta_start: 0.00085
+ beta_end: 0.012
+ beta_schedule: 'scaled_linear'
+ prediction_type: 'epsilon'
+ clip_sample: false
+ # below are for ddim
+ set_alpha_to_one: false
+ steps_offset: 1
configs/modules/scheduler_lcm.yaml ADDED
@@ -0,0 +1,19 @@
+ scheduler:
+ target: mld.models.schedulers.scheduling_lcm.LCMScheduler
+ num_inference_steps: 1
+ cfg_step_map:
+ 1: 8.0
+ 2: 12.5
+ 4: 13.5
+ params:
+ num_train_timesteps: 1000
+ beta_start: 0.00085
+ beta_end: 0.012
+ beta_schedule: 'scaled_linear'
+ clip_sample: false
+ set_alpha_to_one: false
+ original_inference_steps: 10
+ timesteps_step_map:
+ 1: [799]
+ 2: [699, 299]
+ 4: [699, 399, 299, 299]
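The cfg_step_map and timesteps_step_map above are keyed by the number of inference steps: each supported step count (1, 2, or 4) selects a guidance value and a fixed timestep list. A trivial sketch of that lookup (an illustration of the config's intent, not the scheduler implementation):

# Step-count-keyed lookup, mirroring the maps in this config.
cfg_step_map = {1: 8.0, 2: 12.5, 4: 13.5}
timesteps_step_map = {1: [799], 2: [699, 299], 4: [699, 399, 299, 299]}

def lookup(num_inference_steps: int):
    # each supported step count pairs a guidance value with a timestep schedule
    return cfg_step_map[num_inference_steps], timesteps_step_map[num_inference_steps]

print(lookup(1))  # (8.0, [799])
print(lookup(4))  # (13.5, [699, 399, 299, 299])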
configs/modules/text_encoder.yaml ADDED
@@ -0,0 +1,5 @@
+ text_encoder:
+ target: mld.models.architectures.mld_clip.MldTextEncoder
+ params:
+ last_hidden_state: false
+ modelpath: ${model.t5_path}
configs/modules/traj_encoder.yaml ADDED
@@ -0,0 +1,17 @@
+ traj_encoder:
+ target: mld.models.architectures.mld_traj_encoder.MldTrajEncoder
+ params:
+ nfeats: ${DATASET.NJOINTS}
+ latent_dim: ${model.latent_dim}
+ hidden_dim: 256
+ force_post_proj: true
+ ff_size: 1024
+ num_layers: 9
+ num_heads: 4
+ dropout: 0.1
+ normalize_before: false
+ norm_eps: 1e-5
+ activation: 'gelu'
+ norm_post: true
+ activation_post: null
+ position_embedding: 'learned'
configs/motionlcm_control_s.yaml ADDED
@@ -0,0 +1,113 @@
+ FOLDER: './experiments_control/spatial'
+ TEST_FOLDER: './experiments_control_test/spatial'
+
+ NAME: 'motionlcm_humanml'
+
+ SEED_VALUE: 1234
+
+ TRAIN:
+ DATASET: 'humanml3d'
+ BATCH_SIZE: 128
+ SPLIT: 'train'
+ NUM_WORKERS: 8
+ PERSISTENT_WORKERS: true
+
+ PRETRAINED: 'experiments_t2m/motionlcm_humanml/motionlcm_humanml.ckpt'
+
+ validation_steps: -1
+ validation_epochs: 50
+ checkpointing_steps: -1
+ checkpointing_epochs: 50
+ max_train_steps: -1
+ max_train_epochs: 1000
+ learning_rate: 1e-4
+ learning_rate_spatial: 1e-4
+ lr_scheduler: "cosine"
+ lr_warmup_steps: 1000
+ adam_beta1: 0.9
+ adam_beta2: 0.999
+ adam_weight_decay: 0.0
+ adam_epsilon: 1e-08
+ max_grad_norm: 1.0
+
+ VAL:
+ DATASET: 'humanml3d'
+ BATCH_SIZE: 32
+ SPLIT: 'test'
+ NUM_WORKERS: 12
+ PERSISTENT_WORKERS: true
+
+ TEST:
+ DATASET: 'humanml3d'
+ BATCH_SIZE: 32
+ SPLIT: 'test'
+ NUM_WORKERS: 12
+ PERSISTENT_WORKERS: true
+
+ CHECKPOINTS: 'experiments_control/spatial/motionlcm_humanml/motionlcm_humanml_s_pelvis.ckpt'
+ # CHECKPOINTS: 'experiments_control/spatial/motionlcm_humanml/motionlcm_humanml_s_all.ckpt'
+
+ # Testing Args
+ REPLICATION_TIMES: 1
+ DIVERSITY_TIMES: 300
+ DO_MM_TEST: false
+ MAX_NUM_SAMPLES: 1024
+
+ DATASET:
+ NAME: 'humanml3d'
+ SMPL_PATH: './deps/smpl'
+ WORD_VERTILIZER_PATH: './deps/glove/'
+ HUMANML3D:
+ FRAME_RATE: 20.0
+ UNIT_LEN: 4
+ ROOT: './datasets/humanml3d'
+ CONTROL_ARGS:
+ CONTROL: true
+ TEMPORAL: false
+ TRAIN_JOINTS: [0]
+ TEST_JOINTS: [0]
+ TRAIN_DENSITY: 'random'
+ TEST_DENSITY: 100
+ MEAN_STD_PATH: './datasets/humanml_spatial_norm'
+ SAMPLER:
+ MAX_LEN: 200
+ MIN_LEN: 40
+ MAX_TEXT_LEN: 20
+ PADDING_TO_MAX: false
+ WINDOW_SIZE: null
+
+ METRIC:
+ DIST_SYNC_ON_STEP: true
+ TYPE: ['TM2TMetrics', 'ControlMetrics']
+
+ model:
+ target: ['motion_vae', 'text_encoder', 'denoiser', 'scheduler_lcm', 'traj_encoder', 'noise_optimizer']
+ latent_dim: [16, 32]
+ guidance_scale: 'dynamic'
+
+ # ControlNet Args
+ is_controlnet: true
+ vaeloss: true
+ vaeloss_type: 'mask'
+ cond_ratio: 1.0
+ control_loss_func: 'l1_smooth'
+ use_3d: true
+ lcm_w_min_nax: [5, 15]
+ lcm_num_ddim_timesteps: 10
+
+ t2m_textencoder:
+ dim_word: 300
+ dim_pos_ohot: 15
+ dim_text_hidden: 512
+ dim_coemb_hidden: 512
+
+ t2m_motionencoder:
+ dim_move_hidden: 512
+ dim_move_latent: 512
+ dim_motion_hidden: 1024
+ dim_motion_latent: 512
+
+ bert_path: './deps/distilbert-base-uncased'
+ clip_path: './deps/clip-vit-large-patch14'
+ t5_path: './deps/sentence-t5-large'
+ t2m_path: './deps/t2m/'
configs/motionlcm_control_t.yaml ADDED
@@ -0,0 +1,111 @@
+ FOLDER: './experiments_control/temporal'
+ TEST_FOLDER: './experiments_control_test/temporal'
+
+ NAME: 'motionlcm_humanml'
+
+ SEED_VALUE: 1234
+
+ TRAIN:
+ DATASET: 'humanml3d'
+ BATCH_SIZE: 128
+ SPLIT: 'train'
+ NUM_WORKERS: 8
+ PERSISTENT_WORKERS: true
+
+ PRETRAINED: 'experiments_t2m/motionlcm_humanml/motionlcm_humanml.ckpt'
+
+ validation_steps: -1
+ validation_epochs: 50
+ checkpointing_steps: -1
+ checkpointing_epochs: 50
+ max_train_steps: -1
+ max_train_epochs: 1000
+ learning_rate: 1e-4
+ learning_rate_spatial: 1e-4
+ lr_scheduler: "cosine"
+ lr_warmup_steps: 1000
+ adam_beta1: 0.9
+ adam_beta2: 0.999
+ adam_weight_decay: 0.0
+ adam_epsilon: 1e-08
+ max_grad_norm: 1.0
+
+ VAL:
+ DATASET: 'humanml3d'
+ BATCH_SIZE: 32
+ SPLIT: 'test'
+ NUM_WORKERS: 12
+ PERSISTENT_WORKERS: true
+
+ TEST:
+ DATASET: 'humanml3d'
+ BATCH_SIZE: 32
+ SPLIT: 'test'
+ NUM_WORKERS: 12
+ PERSISTENT_WORKERS: true
+
+ CHECKPOINTS: 'experiments_control/temporal/motionlcm_humanml/motionlcm_humanml_t.ckpt'
+
+ # Testing Args
+ REPLICATION_TIMES: 20
+ DIVERSITY_TIMES: 300
+ DO_MM_TEST: false
+
+ DATASET:
+ NAME: 'humanml3d'
+ SMPL_PATH: './deps/smpl'
+ WORD_VERTILIZER_PATH: './deps/glove/'
+ HUMANML3D:
+ FRAME_RATE: 20.0
+ UNIT_LEN: 4
+ ROOT: './datasets/humanml3d'
+ CONTROL_ARGS:
+ CONTROL: true
+ TEMPORAL: true
+ TRAIN_JOINTS: [0, 10, 11, 15, 20, 21]
+ TEST_JOINTS: [0, 10, 11, 15, 20, 21]
+ TRAIN_DENSITY: [25, 25]
+ TEST_DENSITY: 25
+ MEAN_STD_PATH: './datasets/humanml_spatial_norm'
+ SAMPLER:
+ MAX_LEN: 200
+ MIN_LEN: 40
+ MAX_TEXT_LEN: 20
+ PADDING_TO_MAX: false
+ WINDOW_SIZE: null
+
+ METRIC:
+ DIST_SYNC_ON_STEP: true
+ TYPE: ['TM2TMetrics', 'ControlMetrics']
+
+ model:
+ target: ['motion_vae', 'text_encoder', 'denoiser', 'scheduler_lcm', 'traj_encoder', 'noise_optimizer']
+ latent_dim: [16, 32]
+ guidance_scale: 'dynamic'
+
+ # ControlNet Args
+ is_controlnet: true
+ vaeloss: true
+ vaeloss_type: 'sum'
+ cond_ratio: 1.0
+ control_loss_func: 'l2'
+ use_3d: false
+ lcm_w_min_nax: [5, 15]
+ lcm_num_ddim_timesteps: 10
+
+ t2m_textencoder:
+ dim_word: 300
+ dim_pos_ohot: 15
+ dim_text_hidden: 512
+ dim_coemb_hidden: 512
+
+ t2m_motionencoder:
+ dim_move_hidden: 512
+ dim_move_latent: 512
+ dim_motion_hidden: 1024
+ dim_motion_latent: 512
+
+ bert_path: './deps/distilbert-base-uncased'
+ clip_path: './deps/clip-vit-large-patch14'
+ t5_path: './deps/sentence-t5-large'
+ t2m_path: './deps/t2m/'
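The controlled joints [0, 10, 11, 15, 20, 21] above follow the SMPL-style 22-joint ordering used by HumanML3D; joint 0 is the pelvis, consistent with the motionlcm_humanml_s_pelvis.ckpt name in the spatial config. Assuming the standard ordering (verify against mld/data/humanml/utils/paramUtil.py in this commit), a reference map:

# Assumed SMPL-style indices for the controlled joints above (22-joint HumanML3D layout).
CONTROL_JOINT_NAMES = {
    0: "pelvis",
    10: "left_foot",
    11: "right_foot",
    15: "head",
    20: "left_wrist",
    21: "right_wrist",
}

for idx in [0, 10, 11, 15, 20, 21]:
    print(idx, CONTROL_JOINT_NAMES[idx])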
configs/motionlcm_t2m.yaml ADDED
@@ -0,0 +1,109 @@
+ FOLDER: './experiments_t2m'
+ TEST_FOLDER: './experiments_t2m_test'
+
+ NAME: 'motionlcm_humanml'
+
+ SEED_VALUE: 1234
+
+ TRAIN:
+ BATCH_SIZE: 128
+ SPLIT: 'train'
+ NUM_WORKERS: 8
+ PERSISTENT_WORKERS: true
+
+ PRETRAINED: 'experiments_t2m/mld_humanml/mld_humanml.ckpt'
+
+ validation_steps: -1
+ validation_epochs: 50
+ checkpointing_steps: -1
+ checkpointing_epochs: 50
+ max_train_steps: -1
+ max_train_epochs: 1000
+ learning_rate: 2e-4
+ lr_scheduler: "cosine"
+ lr_warmup_steps: 1000
+ adam_beta1: 0.9
+ adam_beta2: 0.999
+ adam_weight_decay: 0.0
+ adam_epsilon: 1e-08
+ max_grad_norm: 1.0
+
+ # Latent Consistency Distillation Specific Arguments
+ w_min: 5.0
+ w_max: 15.0
+ num_ddim_timesteps: 10
+ loss_type: 'huber'
+ huber_c: 0.5
+ unet_time_cond_proj_dim: 256
+ ema_decay: 0.95
+
+ VAL:
+ BATCH_SIZE: 32
+ SPLIT: 'test'
+ NUM_WORKERS: 12
+ PERSISTENT_WORKERS: true
+
+ TEST:
+ BATCH_SIZE: 32
+ SPLIT: 'test'
+ NUM_WORKERS: 12
+ PERSISTENT_WORKERS: true
+
+ CHECKPOINTS: 'experiments_t2m/motionlcm_humanml/motionlcm_humanml.ckpt'
+
+ # Testing Args
+ REPLICATION_TIMES: 20
+ MM_NUM_SAMPLES: 100
+ MM_NUM_REPEATS: 30
+ MM_NUM_TIMES: 10
+ DIVERSITY_TIMES: 300
+ DO_MM_TEST: true
+
+ DATASET:
+ NAME: 'humanml3d'
+ SMPL_PATH: './deps/smpl'
+ WORD_VERTILIZER_PATH: './deps/glove/'
+ HUMANML3D:
+ FRAME_RATE: 20.0
+ UNIT_LEN: 4
+ ROOT: './datasets/humanml3d'
+ CONTROL_ARGS:
+ CONTROL: false
+ TEMPORAL: false
+ TRAIN_JOINTS: [0]
+ TEST_JOINTS: [0]
+ TRAIN_DENSITY: 'random'
+ TEST_DENSITY: 100
+ MEAN_STD_PATH: './datasets/humanml_spatial_norm'
+ SAMPLER:
+ MAX_LEN: 200
+ MIN_LEN: 40
+ MAX_TEXT_LEN: 20
+ PADDING_TO_MAX: false
+ WINDOW_SIZE: null
+
+ METRIC:
+ DIST_SYNC_ON_STEP: true
+ TYPE: ['TM2TMetrics']
+
+ model:
+ target: ['motion_vae', 'text_encoder', 'denoiser', 'scheduler_lcm', 'noise_optimizer']
+ latent_dim: [16, 32]
+ guidance_scale: 'dynamic'
+
+ t2m_textencoder:
+ dim_word: 300
+ dim_pos_ohot: 15
+ dim_text_hidden: 512
+ dim_coemb_hidden: 512
+
+ t2m_motionencoder:
+ dim_move_hidden: 512
+ dim_move_latent: 512
+ dim_motion_hidden: 1024
+ dim_motion_latent: 512
+
+ bert_path: './deps/distilbert-base-uncased'
+ clip_path: './deps/clip-vit-large-patch14'
+ t5_path: './deps/sentence-t5-large'
+ t2m_path: './deps/t2m/'
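The w_min / w_max arguments above bound the guidance weight sampled during latent consistency distillation: each sample draws a w uniformly from [w_min, w_max], and the teacher prediction is the classifier-free-guidance combination under that weight. A generic sketch of that step (not this repo's training loop; tensor shapes are illustrative):

# Generic latent-consistency-distillation guidance sampling matching w_min / w_max above.
import torch

def sample_guidance_and_cfg(pred_cond: torch.Tensor,
                            pred_uncond: torch.Tensor,
                            w_min: float = 5.0,
                            w_max: float = 15.0) -> torch.Tensor:
    bsz = pred_cond.shape[0]
    # one guidance weight per sample, drawn uniformly from [w_min, w_max]
    w = (w_max - w_min) * torch.rand(bsz, 1, 1) + w_min
    # classifier-free-guidance-augmented teacher output
    return pred_uncond + w * (pred_cond - pred_uncond)

cond = torch.randn(4, 1, 256)
uncond = torch.randn(4, 1, 256)
print(sample_guidance_and_cfg(cond, uncond).shape)  # torch.Size([4, 1, 256])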
configs/motionlcm_t2m_clt.yaml ADDED
@@ -0,0 +1,69 @@
+ FOLDER: './experiments_t2m'
+ TEST_FOLDER: './experiments_t2m_test'
+
+ NAME: 'motionlcm_humanml'
+
+ SEED_VALUE: 1234
+
+ TEST:
+ BATCH_SIZE: 1
+ SPLIT: 'test'
+ NUM_WORKERS: 12
+ PERSISTENT_WORKERS: true
+
+ CHECKPOINTS: 'experiments_t2m/motionlcm_humanml/motionlcm_humanml.ckpt'
+
+ # Testing Args
+ REPLICATION_TIMES: 1
+ DIVERSITY_TIMES: 300
+ DO_MM_TEST: false
+ MAX_NUM_SAMPLES: 1024
+
+ DATASET:
+ NAME: 'humanml3d'
+ SMPL_PATH: './deps/smpl'
+ WORD_VERTILIZER_PATH: './deps/glove/'
+ HUMANML3D:
+ FRAME_RATE: 20.0
+ UNIT_LEN: 4
+ ROOT: './datasets/humanml3d'
+ CONTROL_ARGS:
+ CONTROL: true
+ TEMPORAL: false
+ TRAIN_JOINTS: [0]
+ TEST_JOINTS: [0]
+ TRAIN_DENSITY: 'random'
+ TEST_DENSITY: 100
+ MEAN_STD_PATH: './datasets/humanml_spatial_norm'
+ SAMPLER:
+ MAX_LEN: 200
+ MIN_LEN: 40
+ MAX_TEXT_LEN: 20
+ PADDING_TO_MAX: false
+ WINDOW_SIZE: null
+
+ METRIC:
+ DIST_SYNC_ON_STEP: true
+ TYPE: ['TM2TMetrics', 'ControlMetrics']
+
+ model:
+ target: ['motion_vae', 'text_encoder', 'denoiser', 'scheduler_lcm', 'noise_optimizer']
+ latent_dim: [16, 32]
+ guidance_scale: 'dynamic'
+
+ t2m_textencoder:
+ dim_word: 300
+ dim_pos_ohot: 15
+ dim_text_hidden: 512
+ dim_coemb_hidden: 512
+
+ t2m_motionencoder:
+ dim_move_hidden: 512
+ dim_move_latent: 512
+ dim_motion_hidden: 1024
+ dim_motion_latent: 512
+
+ bert_path: './deps/distilbert-base-uncased'
+ clip_path: './deps/clip-vit-large-patch14'
+ t5_path: './deps/sentence-t5-large'
+ t2m_path: './deps/t2m/'
configs/vae.yaml ADDED
@@ -0,0 +1,103 @@
+ FOLDER: './experiments_recons'
+ TEST_FOLDER: './experiments_recons_test'
+
+ NAME: 'vae_humanml'
+
+ SEED_VALUE: 1234
+
+ TRAIN:
+ BATCH_SIZE: 128
+ SPLIT: 'train'
+ NUM_WORKERS: 8
+ PERSISTENT_WORKERS: true
+ PRETRAINED: ''
+
+ validation_steps: -1
+ validation_epochs: 100
+ checkpointing_steps: -1
+ checkpointing_epochs: 100
+ max_train_steps: -1
+ max_train_epochs: 6000
+ learning_rate: 2e-4
+ lr_scheduler: "cosine"
+ lr_warmup_steps: 1000
+ adam_beta1: 0.9
+ adam_beta2: 0.999
+ adam_weight_decay: 0.0
+ adam_epsilon: 1e-08
+ max_grad_norm: 1.0
+
+ VAL:
+ BATCH_SIZE: 32
+ SPLIT: 'test'
+ NUM_WORKERS: 12
+ PERSISTENT_WORKERS: true
+
+ TEST:
+ BATCH_SIZE: 32
+ SPLIT: 'test'
+ NUM_WORKERS: 12
+ PERSISTENT_WORKERS: true
+
+ CHECKPOINTS: 'experiments_recons/vae_humanml/vae_humanml.ckpt'
+
+ # Testing Args
+ REPLICATION_TIMES: 20
+ DIVERSITY_TIMES: 300
+ DO_MM_TEST: false
+
+ DATASET:
+ NAME: 'humanml3d'
+ SMPL_PATH: './deps/smpl'
+ WORD_VERTILIZER_PATH: './deps/glove/'
+ HUMANML3D:
+ FRAME_RATE: 20.0
+ UNIT_LEN: 4
+ ROOT: './datasets/humanml3d'
+ CONTROL_ARGS:
+ CONTROL: false
+ TEMPORAL: false
+ TRAIN_JOINTS: [0]
+ TEST_JOINTS: [0]
+ TRAIN_DENSITY: 'random'
+ TEST_DENSITY: 100
+ MEAN_STD_PATH: './datasets/humanml_spatial_norm'
+ SAMPLER:
+ MAX_LEN: 200
+ MIN_LEN: 40
+ MAX_TEXT_LEN: 20
+ PADDING_TO_MAX: true
+ WINDOW_SIZE: 64
+
+ METRIC:
+ DIST_SYNC_ON_STEP: true
+ TYPE: ['TM2TMetrics', "PosMetrics"]
+
+ model:
+ target: ['motion_vae']
+ latent_dim: [16, 32]
+
+ # VAE Args
+ rec_feats_ratio: 1.0
+ rec_joints_ratio: 1.0
+ rec_velocity_ratio: 0.0
+ kl_ratio: 1e-4
+
+ rec_feats_loss: 'l1_smooth'
+ rec_joints_loss: 'l1_smooth'
+ rec_velocity_loss: 'l1_smooth'
+ mask_loss: true
+
+ t2m_textencoder:
+ dim_word: 300
+ dim_pos_ohot: 15
+ dim_text_hidden: 512
+ dim_coemb_hidden: 512
+
+ t2m_motionencoder:
+ dim_move_hidden: 512
+ dim_move_latent: 512
+ dim_motion_hidden: 1024
+ dim_motion_latent: 512
+
+ t2m_path: './deps/t2m/'
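The VAE args above weight the reconstruction terms against the KL term (rec_velocity_ratio is 0.0, so the velocity term is effectively disabled here). A sketch of how such ratios are typically combined, assuming a standard weighted VAE objective rather than this repo's exact loss code:

# Weighted VAE objective sketch: 'l1_smooth' terms scaled by the ratios above plus a KL term.
import torch
import torch.nn.functional as F

def vae_loss(feats_rec, feats_gt, joints_rec, joints_gt, mu, logvar,
             rec_feats_ratio=1.0, rec_joints_ratio=1.0, kl_ratio=1e-4):
    loss_feats = F.smooth_l1_loss(feats_rec, feats_gt)      # rec_feats_loss: 'l1_smooth'
    loss_joints = F.smooth_l1_loss(joints_rec, joints_gt)   # rec_joints_loss: 'l1_smooth'
    kl = -0.5 * torch.mean(1 + logvar - mu.pow(2) - logvar.exp())
    return rec_feats_ratio * loss_feats + rec_joints_ratio * loss_joints + kl_ratio * kl

feats = torch.randn(2, 196, 263)
joints = torch.randn(2, 196, 22, 3)
mu, logvar = torch.randn(2, 16, 32), torch.randn(2, 16, 32)
print(vae_loss(feats, feats + 0.01, joints, joints + 0.01, mu, logvar))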
configs_v1/modules/denoiser.yaml ADDED
@@ -0,0 +1,28 @@
+ denoiser:
+ target: mld.models.architectures.mld_denoiser.MldDenoiser
+ params:
+ latent_dim: ${model.latent_dim}
+ hidden_dim: null
+ text_dim: 768
+ time_dim: 768
+ ff_size: 1024
+ num_layers: 9
+ num_heads: 4
+ dropout: 0.1
+ normalize_before: false
+ norm_eps: 1e-5
+ activation: 'gelu'
+ norm_post: true
+ activation_post: null
+ flip_sin_to_cos: true
+ freq_shift: 0
+ time_act_fn: 'silu'
+ time_post_act_fn: null
+ position_embedding: 'learned'
+ arch: 'trans_enc'
+ add_mem_pos: true
+ force_pre_post_proj: false
+ text_act_fn: 'relu'
+ zero_init_cond: true
+ controlnet_embed_dim: 256
+ controlnet_act_fn: null
configs_v1/modules/motion_vae.yaml ADDED
@@ -0,0 +1,18 @@
+ motion_vae:
+ target: mld.models.architectures.mld_vae.MldVae
+ params:
+ nfeats: ${DATASET.NFEATS}
+ latent_dim: ${model.latent_dim}
+ hidden_dim: null
+ force_pre_post_proj: false
+ ff_size: 1024
+ num_layers: 9
+ num_heads: 4
+ dropout: 0.1
+ arch: 'encoder_decoder'
+ normalize_before: false
+ norm_eps: 1e-5
+ activation: 'gelu'
+ norm_post: true
+ activation_post: null
+ position_embedding: 'learned'
configs_v1/modules/scheduler_lcm.yaml ADDED
@@ -0,0 +1,11 @@
+ scheduler:
+ target: diffusers.LCMScheduler
+ num_inference_steps: 1
+ params:
+ num_train_timesteps: 1000
+ beta_start: 0.00085
+ beta_end: 0.012
+ beta_schedule: 'scaled_linear'
+ clip_sample: false
+ set_alpha_to_one: false
+ original_inference_steps: 50
configs_v1/modules/text_encoder.yaml ADDED
@@ -0,0 +1,5 @@
+ text_encoder:
+ target: mld.models.architectures.mld_clip.MldTextEncoder
+ params:
+ last_hidden_state: false
+ modelpath: ${model.t5_path}
configs_v1/modules/traj_encoder.yaml ADDED
@@ -0,0 +1,17 @@
+ traj_encoder:
+ target: mld.models.architectures.mld_traj_encoder.MldTrajEncoder
+ params:
+ nfeats: ${DATASET.NJOINTS}
+ latent_dim: ${model.latent_dim}
+ hidden_dim: null
+ force_post_proj: false
+ ff_size: 1024
+ num_layers: 9
+ num_heads: 4
+ dropout: 0.1
+ normalize_before: false
+ norm_eps: 1e-5
+ activation: 'gelu'
+ norm_post: true
+ activation_post: null
+ position_embedding: 'learned'
configs_v1/motionlcm_control_t.yaml ADDED
@@ -0,0 +1,114 @@
+ FOLDER: './experiments_control/temporal'
+ TEST_FOLDER: './experiments_control_test/temporal'
+
+ NAME: 'motionlcm_humanml'
+
+ SEED_VALUE: 1234
+
+ TRAIN:
+ DATASET: 'humanml3d'
+ BATCH_SIZE: 128
+ SPLIT: 'train'
+ NUM_WORKERS: 8
+ PERSISTENT_WORKERS: true
+
+ PRETRAINED: 'experiments_t2m/motionlcm_humanml/motionlcm_humanml_v1.ckpt'
+
+ validation_steps: -1
+ validation_epochs: 50
+ checkpointing_steps: -1
+ checkpointing_epochs: 50
+ max_train_steps: -1
+ max_train_epochs: 1000
+ learning_rate: 1e-4
+ learning_rate_spatial: 1e-4
+ lr_scheduler: "cosine"
+ lr_warmup_steps: 1000
+ adam_beta1: 0.9
+ adam_beta2: 0.999
+ adam_weight_decay: 0.0
+ adam_epsilon: 1e-08
+ max_grad_norm: 1.0
+
+ VAL:
+ DATASET: 'humanml3d'
+ BATCH_SIZE: 32
+ SPLIT: 'test'
+ NUM_WORKERS: 12
+ PERSISTENT_WORKERS: true
+
+ TEST:
+ DATASET: 'humanml3d'
+ BATCH_SIZE: 32
+ SPLIT: 'test'
+ NUM_WORKERS: 12
+ PERSISTENT_WORKERS: true
+
+ CHECKPOINTS: 'experiments_control/temporal/motionlcm_humanml/motionlcm_humanml_t_v1.ckpt'
+
+ # Testing Args
+ REPLICATION_TIMES: 20
+ MM_NUM_SAMPLES: 100
+ MM_NUM_REPEATS: 30
+ MM_NUM_TIMES: 10
+ DIVERSITY_TIMES: 300
+ DO_MM_TEST: false
+
+ DATASET:
+ NAME: 'humanml3d'
+ SMPL_PATH: './deps/smpl'
+ WORD_VERTILIZER_PATH: './deps/glove/'
+ HUMANML3D:
+ FRAME_RATE: 20.0
+ UNIT_LEN: 4
+ ROOT: './datasets/humanml3d'
+ CONTROL_ARGS:
+ CONTROL: true
+ TEMPORAL: true
+ TRAIN_JOINTS: [0, 10, 11, 15, 20, 21]
+ TEST_JOINTS: [0, 10, 11, 15, 20, 21]
+ TRAIN_DENSITY: [25, 25]
+ TEST_DENSITY: 25
+ MEAN_STD_PATH: './datasets/humanml_spatial_norm'
+ SAMPLER:
+ MAX_LEN: 200
+ MIN_LEN: 40
+ MAX_TEXT_LEN: 20
+ PADDING_TO_MAX: false
+ WINDOW_SIZE: null
+
+ METRIC:
+ DIST_SYNC_ON_STEP: true
+ TYPE: ['TM2TMetrics', 'ControlMetrics']
+
+ model:
+ target: ['motion_vae', 'text_encoder', 'denoiser', 'scheduler_lcm', 'traj_encoder']
+ latent_dim: [1, 256]
+ guidance_scale: 7.5
+
+ # ControlNet Args
+ is_controlnet: true
+ vaeloss: true
+ vaeloss_type: 'sum'
+ cond_ratio: 1.0
+ control_loss_func: 'l2'
+ use_3d: false
+ lcm_w_min_nax: null
+ lcm_num_ddim_timesteps: null
+
+ t2m_textencoder:
+ dim_word: 300
+ dim_pos_ohot: 15
+ dim_text_hidden: 512
+ dim_coemb_hidden: 512
+
+ t2m_motionencoder:
+ dim_move_hidden: 512
+ dim_move_latent: 512
+ dim_motion_hidden: 1024
+ dim_motion_latent: 512
+
+ bert_path: './deps/distilbert-base-uncased'
+ clip_path: './deps/clip-vit-large-patch14'
+ t5_path: './deps/sentence-t5-large'
+ t2m_path: './deps/t2m/'
configs_v1/motionlcm_t2m.yaml ADDED
@@ -0,0 +1,109 @@
+ FOLDER: './experiments_t2m'
+ TEST_FOLDER: './experiments_t2m_test'
+
+ NAME: 'motionlcm_humanml'
+
+ SEED_VALUE: 1234
+
+ TRAIN:
+ BATCH_SIZE: 256
+ SPLIT: 'train'
+ NUM_WORKERS: 8
+ PERSISTENT_WORKERS: true
+
+ PRETRAINED: 'experiments_t2m/mld_humanml/mld_humanml_v1.ckpt'
+
+ validation_steps: -1
+ validation_epochs: 50
+ checkpointing_steps: -1
+ checkpointing_epochs: 50
+ max_train_steps: -1
+ max_train_epochs: 1000
+ learning_rate: 2e-4
+ lr_scheduler: "cosine"
+ lr_warmup_steps: 1000
+ adam_beta1: 0.9
+ adam_beta2: 0.999
+ adam_weight_decay: 0.0
+ adam_epsilon: 1e-08
+ max_grad_norm: 1.0
+
+ # Latent Consistency Distillation Specific Arguments
+ w_min: 5.0
+ w_max: 15.0
+ num_ddim_timesteps: 50
+ loss_type: 'huber'
+ huber_c: 0.001
+ unet_time_cond_proj_dim: 256
+ ema_decay: 0.95
+
+ VAL:
+ BATCH_SIZE: 32
+ SPLIT: 'test'
+ NUM_WORKERS: 12
+ PERSISTENT_WORKERS: true
+
+ TEST:
+ BATCH_SIZE: 32
+ SPLIT: 'test'
+ NUM_WORKERS: 12
+ PERSISTENT_WORKERS: true
+
+ CHECKPOINTS: 'experiments_t2m/motionlcm_humanml/motionlcm_humanml_v1.ckpt'
+
+ # Testing Args
+ REPLICATION_TIMES: 20
+ MM_NUM_SAMPLES: 100
+ MM_NUM_REPEATS: 30
+ MM_NUM_TIMES: 10
+ DIVERSITY_TIMES: 300
+ DO_MM_TEST: true
+
+ DATASET:
+ NAME: 'humanml3d'
+ SMPL_PATH: './deps/smpl'
+ WORD_VERTILIZER_PATH: './deps/glove/'
+ HUMANML3D:
+ FRAME_RATE: 20.0
+ UNIT_LEN: 4
+ ROOT: './datasets/humanml3d'
+ CONTROL_ARGS:
+ CONTROL: false
+ TEMPORAL: false
+ TRAIN_JOINTS: [0]
+ TEST_JOINTS: [0]
+ TRAIN_DENSITY: 'random'
+ TEST_DENSITY: 100
+ MEAN_STD_PATH: './datasets/humanml_spatial_norm'
+ SAMPLER:
+ MAX_LEN: 200
+ MIN_LEN: 40
+ MAX_TEXT_LEN: 20
+ PADDING_TO_MAX: false
+ WINDOW_SIZE: null
+
+ METRIC:
+ DIST_SYNC_ON_STEP: true
+ TYPE: ['TM2TMetrics']
+
+ model:
+ target: ['motion_vae', 'text_encoder', 'denoiser', 'scheduler_lcm']
+ latent_dim: [1, 256]
+ guidance_scale: 7.5
+
+ t2m_textencoder:
+ dim_word: 300
+ dim_pos_ohot: 15
+ dim_text_hidden: 512
+ dim_coemb_hidden: 512
+
+ t2m_motionencoder:
+ dim_move_hidden: 512
+ dim_move_latent: 512
+ dim_motion_hidden: 1024
+ dim_motion_latent: 512
+
+ bert_path: './deps/distilbert-base-uncased'
+ clip_path: './deps/clip-vit-large-patch14'
+ t5_path: './deps/sentence-t5-large'
+ t2m_path: './deps/t2m/'
demo.py ADDED
@@ -0,0 +1,196 @@
+ import os
+ import pickle
+ import sys
+ import datetime
+ import logging
+ import os.path as osp
+
+ from omegaconf import OmegaConf
+
+ import torch
+
+ from mld.config import parse_args
+ from mld.data.get_data import get_dataset
+ from mld.models.modeltype.mld import MLD
+ from mld.models.modeltype.vae import VAE
+ from mld.utils.utils import set_seed, move_batch_to_device
+ from mld.data.humanml.utils.plot_script import plot_3d_motion
+ from mld.utils.temos_utils import remove_padding
+
+ os.environ["TOKENIZERS_PARALLELISM"] = "false"
+
+
+ def load_example_hint_input(text_path: str) -> tuple:
+ with open(text_path, "r") as f:
+ lines = f.readlines()
+
+ n_frames, control_type_ids, control_hint_ids = [], [], []
+ for line in lines:
+ s = line.strip()
+ n_frame, control_type_id, control_hint_id = s.split(' ')
+ n_frames.append(int(n_frame))
+ control_type_ids.append(int(control_type_id))
+ control_hint_ids.append(int(control_hint_id))
+
+ return n_frames, control_type_ids, control_hint_ids
+
+
+ def load_example_input(text_path: str) -> tuple:
+ with open(text_path, "r") as f:
+ lines = f.readlines()
+
+ texts, lens = [], []
+ for line in lines:
+ s = line.strip()
+ s_l = s.split(" ")[0]
+ s_t = s[(len(s_l) + 1):]
+ lens.append(int(s_l))
+ texts.append(s_t)
+ return texts, lens
+
+
+ def main():
+ cfg = parse_args()
+ device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
+ set_seed(cfg.SEED_VALUE)
+
+ name_time_str = osp.join(cfg.NAME, "demo_" + datetime.datetime.now().strftime("%Y-%m-%dT%H-%M-%S"))
+ cfg.output_dir = osp.join(cfg.TEST_FOLDER, name_time_str)
+ vis_dir = osp.join(cfg.output_dir, 'samples')
+ os.makedirs(cfg.output_dir, exist_ok=False)
+ os.makedirs(vis_dir, exist_ok=False)
+
+ steam_handler = logging.StreamHandler(sys.stdout)
+ file_handler = logging.FileHandler(osp.join(cfg.output_dir, 'output.log'))
+ logging.basicConfig(level=logging.INFO,
+ format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
+ datefmt="%m/%d/%Y %H:%M:%S",
+ handlers=[steam_handler, file_handler])
+ logger = logging.getLogger(__name__)
+
+ OmegaConf.save(cfg, osp.join(cfg.output_dir, 'config.yaml'))
+
+ state_dict = torch.load(cfg.TEST.CHECKPOINTS, map_location="cpu")["state_dict"]
+ logger.info("Loading checkpoints from {}".format(cfg.TEST.CHECKPOINTS))
+
+ # Step 1: Check if the checkpoint is VAE-based.
+ is_vae = False
+ vae_key = 'vae.skel_embedding.weight'
+ if vae_key in state_dict:
+ is_vae = True
+ logger.info(f'Is VAE: {is_vae}')
+
+ # Step 2: Check if the checkpoint is MLD-based.
+ is_mld = False
+ mld_key = 'denoiser.time_embedding.linear_1.weight'
+ if mld_key in state_dict:
+ is_mld = True
+ logger.info(f'Is MLD: {is_mld}')
+
+ # Step 3: Check if the checkpoint is LCM-based.
+ is_lcm = False
+ lcm_key = 'denoiser.time_embedding.cond_proj.weight' # unique key for CFG
+ if lcm_key in state_dict:
+ is_lcm = True
+ time_cond_proj_dim = state_dict[lcm_key].shape[1]
+ cfg.model.denoiser.params.time_cond_proj_dim = time_cond_proj_dim
+ logger.info(f'Is LCM: {is_lcm}')
+
+ # Step 4: Check if the checkpoint is Controlnet-based.
+ cn_key = "controlnet.controlnet_cond_embedding.0.weight"
+ is_controlnet = True if cn_key in state_dict else False
+ cfg.model.is_controlnet = is_controlnet
+ logger.info(f'Is Controlnet: {is_controlnet}')
+
+ if is_mld or is_lcm or is_controlnet:
+ target_model_class = MLD
+ else:
+ target_model_class = VAE
+
+ if cfg.optimize:
+ assert cfg.model.get('noise_optimizer') is not None
+ cfg.model.noise_optimizer.params.optimize = True
+ logger.info('Optimization enabled. Set the batch size to 1.')
+ logger.info(f'Original batch size: {cfg.TEST.BATCH_SIZE}')
+ cfg.TEST.BATCH_SIZE = 1
+
+ dataset = get_dataset(cfg)
+ model = target_model_class(cfg, dataset)
+ model.to(device)
+ model.eval()
+ model.requires_grad_(False)
+ logger.info(model.load_state_dict(state_dict))
+
+ FPS = eval(f"cfg.DATASET.{cfg.DATASET.NAME.upper()}.FRAME_RATE")
+
+ if cfg.example is not None and not is_controlnet:
+ text, length = load_example_input(cfg.example)
+ for t, l in zip(text, length):
+ logger.info(f"{l}: {t}")
+
+ batch = {"length": length, "text": text}
+
+ for rep_i in range(cfg.replication):
+ with torch.no_grad():
+ joints = model(batch)[0]
+
+ num_samples = len(joints)
+ for i in range(num_samples):
+ res = dict()
+ pkl_path = osp.join(vis_dir, f"sample_id_{i}_length_{length[i]}_rep_{rep_i}.pkl")
+ res['joints'] = joints[i].detach().cpu().numpy()
+ res['text'] = text[i]
+ res['length'] = length[i]
+ res['hint'] = None
+ with open(pkl_path, 'wb') as f:
+ pickle.dump(res, f)
+ logger.info(f"Motions are generated here:\n{pkl_path}")
+
+ if not cfg.no_plot:
+ plot_3d_motion(pkl_path.replace('.pkl', '.mp4'), joints[i].detach().cpu().numpy(), text[i], fps=FPS)
+
+ else:
+ test_dataloader = dataset.test_dataloader()
+ for rep_i in range(cfg.replication):
+ for batch_id, batch in enumerate(test_dataloader):
+ batch = move_batch_to_device(batch, device)
+ with torch.no_grad():
+ joints, joints_ref = model(batch)
+
+ num_samples = len(joints)
+ text = batch['text']
+ length = batch['length']
+ if 'hint' in batch:
+ hint, hint_mask = batch['hint'], batch['hint_mask']
+ hint = dataset.denorm_spatial(hint) * hint_mask
+ hint = remove_padding(hint, lengths=length)
+ else:
+ hint = None
+
+ for i in range(num_samples):
+ res = dict()
+ pkl_path = osp.join(vis_dir, f"batch_id_{batch_id}_sample_id_{i}_length_{length[i]}_rep_{rep_i}.pkl")
+ res['joints'] = joints[i].detach().cpu().numpy()
+ res['text'] = text[i]
+ res['length'] = length[i]
+ res['hint'] = hint[i].detach().cpu().numpy() if hint is not None else None
+ with open(pkl_path, 'wb') as f:
+ pickle.dump(res, f)
+ logger.info(f"Motions are generated here:\n{pkl_path}")
+
+ if not cfg.no_plot:
+ plot_3d_motion(pkl_path.replace('.pkl', '.mp4'), joints[i].detach().cpu().numpy(),
+ text[i], fps=FPS, hint=hint[i].detach().cpu().numpy() if hint is not None else None)
+
+ if rep_i == 0:
+ res['joints'] = joints_ref[i].detach().cpu().numpy()
+ with open(pkl_path.replace('.pkl', '_ref.pkl'), 'wb') as f:
+ pickle.dump(res, f)
+ logger.info(f"Motions are generated here:\n{pkl_path.replace('.pkl', '_ref.pkl')}")
+ if not cfg.no_plot:
+ plot_3d_motion(pkl_path.replace('.pkl', '_ref.mp4'), joints_ref[i].detach().cpu().numpy(),
+ text[i], fps=FPS, hint=hint[i].detach().cpu().numpy() if hint is not None else None)
+
+
+ if __name__ == "__main__":
+ main()
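For reference, load_example_input above expects one sample per line, with the frame count first and the prompt after it (load_example_hint_input similarly expects three integers per line). A sketch that writes such a file — the file name and prompts are illustrative — which can then be passed via --example together with --cfg:

# Writes an input file in the format parsed by load_example_input: "<num_frames> <text>".
examples = [
    (196, "a person walks forward and then turns around."),
    (120, "a person waves both arms in the air."),
]
with open("example.txt", "w") as f:  # illustrative file name
    for n_frames, prompt in examples:
        f.write(f"{n_frames} {prompt}\n")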
fit.py ADDED
@@ -0,0 +1,136 @@
+ # borrow from optimization https://github.com/wangsen1312/joints2smpl
+ import os
+ import argparse
+ import pickle
+
+ import h5py
+ import natsort
+ import smplx
+
+ import torch
+
+ from mld.transforms.joints2rots import config
+ from mld.transforms.joints2rots.smplify import SMPLify3D
+
+ parser = argparse.ArgumentParser()
+ parser.add_argument("--pkl", type=str, default=None, help="pkl motion file")
+ parser.add_argument("--dir", type=str, default=None, help="pkl motion folder")
+ parser.add_argument("--num_smplify_iters", type=int, default=150, help="num of smplify iters")
+ parser.add_argument("--cuda", type=bool, default=True, help="enables cuda")
+ parser.add_argument("--gpu_ids", type=int, default=0, help="choose gpu ids")
+ parser.add_argument("--num_joints", type=int, default=22, help="joint number")
+ parser.add_argument("--joint_category", type=str, default="AMASS", help="use correspondence")
+ parser.add_argument("--fix_foot", type=str, default="False", help="fix foot or not")
+ opt = parser.parse_args()
+ print(opt)
+
+ if opt.pkl:
+ paths = [opt.pkl]
+ elif opt.dir:
+ paths = []
+ file_list = natsort.natsorted(os.listdir(opt.dir))
+ for item in file_list:
+ if item.endswith('.pkl') and not item.endswith("_mesh.pkl"):
+ paths.append(os.path.join(opt.dir, item))
+ else:
+ raise ValueError(f'{opt.pkl} and {opt.dir} are both None!')
+
+ for path in paths:
+ # load joints
+ if os.path.exists(path.replace('.pkl', '_mesh.pkl')):
+ print(f"{path} is rendered! skip!")
+ continue
+
+ with open(path, 'rb') as f:
+ data = pickle.load(f)
+
+ joints = data['joints']
+ # load predefined something
+ device = torch.device("cuda:" + str(opt.gpu_ids) if opt.cuda else "cpu")
+ print(config.SMPL_MODEL_DIR)
+ smplxmodel = smplx.create(
+ config.SMPL_MODEL_DIR,
+ model_type="smpl",
+ gender="neutral",
+ ext="pkl",
+ batch_size=joints.shape[0],
+ ).to(device)
+
+ # load the mean pose as original
+ smpl_mean_file = config.SMPL_MEAN_FILE
+
+ file = h5py.File(smpl_mean_file, "r")
+ init_mean_pose = (
+ torch.from_numpy(file["pose"][:])
+ .unsqueeze(0).repeat(joints.shape[0], 1)
+ .float()
+ .to(device)
+ )
+ init_mean_shape = (
+ torch.from_numpy(file["shape"][:])
+ .unsqueeze(0).repeat(joints.shape[0], 1)
+ .float()
+ .to(device)
+ )
+ cam_trans_zero = torch.Tensor([0.0, 0.0, 0.0]).unsqueeze(0).to(device)
+
+ # initialize SMPLify
+ smplify = SMPLify3D(
+ smplxmodel=smplxmodel,
+ batch_size=joints.shape[0],
+ joints_category=opt.joint_category,
+ num_iters=opt.num_smplify_iters,
+ device=device,
+ )
+ print("initialize SMPLify3D done!")
+
+ print("Start SMPLify!")
+ keypoints_3d = torch.Tensor(joints).to(device).float()
+
+ if opt.joint_category == "AMASS":
+ confidence_input = torch.ones(opt.num_joints)
+ # make sure the foot and ankle (--fix_foot is a string flag, so compare explicitly)
+ if opt.fix_foot == "True":
+ confidence_input[7] = 1.5
+ confidence_input[8] = 1.5
+ confidence_input[10] = 1.5
+ confidence_input[11] = 1.5
+ else:
+ print("Such category not settle down!")
+
+ # ----- from initial to fitting -------
+ (
+ new_opt_vertices,
+ new_opt_joints,
+ new_opt_pose,
+ new_opt_betas,
+ new_opt_cam_t,
+ new_opt_joint_loss,
+ ) = smplify(
+ init_mean_pose.detach(),
+ init_mean_shape.detach(),
+ cam_trans_zero.detach(),
+ keypoints_3d,
+ conf_3d=confidence_input.to(device)
+ )
+
+ # fix shape
+ betas = torch.zeros_like(new_opt_betas)
+ root = keypoints_3d[:, 0, :]
+
+ output = smplxmodel(
+ betas=betas,
+ global_orient=new_opt_pose[:, :3],
+ body_pose=new_opt_pose[:, 3:],
+ transl=root,
+ return_verts=True
+ )
+ vertices = output.vertices.detach().cpu().numpy()
+ floor_height = vertices[..., 1].min()
+ vertices[..., 1] -= floor_height
+ data['vertices'] = vertices
+
+ save_file = path.replace('.pkl', '_mesh.pkl')
+ with open(save_file, 'wb') as f:
+ pickle.dump(data, f)
+ print(f'vertices saved in {save_file}')
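fit.py reads a pickle containing a 'joints' array (as written by demo.py) and writes a sibling '<name>_mesh.pkl' with an added 'vertices' array. A usage sketch for inspecting the output; the file name is illustrative, and 6890 is the standard SMPL vertex count:

# Inspect a mesh pickle produced by fit.py.
import pickle

path = "sample_id_0_length_196_rep_0_mesh.pkl"  # illustrative file name
with open(path, "rb") as f:
    data = pickle.load(f)

print(data["joints"].shape)    # (num_frames, 22, 3), joints written by demo.py
print(data["vertices"].shape)  # (num_frames, 6890, 3), SMPL vertices added by fit.py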
mld/__init__.py ADDED
File without changes
mld/config.py ADDED
@@ -0,0 +1,52 @@
+ import os
+ import importlib
+ from typing import Type, TypeVar
+ from argparse import ArgumentParser
+
+ from omegaconf import OmegaConf, DictConfig
+
+
+ def get_module_config(cfg_model: DictConfig, paths: list[str], cfg_root: str) -> DictConfig:
+ files = [os.path.join(cfg_root, 'modules', p+'.yaml') for p in paths]
+ for file in files:
+ assert os.path.exists(file), f'{file} does not exist.'
+ with open(file, 'r') as f:
+ cfg_model.merge_with(OmegaConf.load(f))
+ return cfg_model
+
+
+ def get_obj_from_str(string: str, reload: bool = False) -> Type:
+ module, cls = string.rsplit(".", 1)
+ if reload:
+ module_imp = importlib.import_module(module)
+ importlib.reload(module_imp)
+ return getattr(importlib.import_module(module, package=None), cls)
+
+
+ def instantiate_from_config(config: DictConfig) -> TypeVar:
+ return get_obj_from_str(config["target"])(**config.get("params", dict()))
+
+
+ def parse_args() -> DictConfig:
+ parser = ArgumentParser()
+ parser.add_argument("--cfg", type=str, required=True, help="The main config file")
+ parser.add_argument('--example', type=str, required=False, help="The input texts and lengths with txt format")
+ parser.add_argument('--example_hint', type=str, required=False, help="The input hint ids and lengths with txt format")
+ parser.add_argument('--no-plot', action="store_true", required=False, help="Whether to plot the skeleton-based motion")
+ parser.add_argument('--replication', type=int, default=1, help="The number of replications of sampling")
+ parser.add_argument('--vis', type=str, default="tb", choices=['tb', 'swanlab'], help="The visualization backends: tensorboard or swanlab")
+ parser.add_argument('--optimize', action='store_true', help="Enable optimization for motion control")
+ args = parser.parse_args()
+
+ cfg = OmegaConf.load(args.cfg)
+ cfg_root = os.path.dirname(args.cfg)
+ cfg_model = get_module_config(cfg.model, cfg.model.target, cfg_root)
+ cfg = OmegaConf.merge(cfg, cfg_model)
+
+ cfg.example = args.example
+ cfg.example_hint = args.example_hint
+ cfg.no_plot = args.no_plot
+ cfg.replication = args.replication
+ cfg.vis = args.vis
+ cfg.optimize = args.optimize
+ return cfg
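Usage sketch for instantiate_from_config above: any config node with a dotted 'target' path and an optional 'params' dict can be built this way. The target below is a standard-library stand-in, not a repo class, and the import assumes the repo is on PYTHONPATH:

# Build an object from a {target, params} config node.
from collections import OrderedDict
from omegaconf import OmegaConf
from mld.config import instantiate_from_config

node = OmegaConf.create({
    "target": "collections.OrderedDict",  # stand-in target for illustration
    "params": {},
})
obj = instantiate_from_config(node)
print(type(obj) is OrderedDict)  # True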
mld/data/__init__.py ADDED
File without changes
mld/data/base.py ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import copy
2
+ from os.path import join as pjoin
3
+ from typing import Any, Callable
4
+
5
+ from torch.utils.data import DataLoader
6
+
7
+
8
+ class BaseDataModule:
9
+ def __init__(self, collate_fn: Callable) -> None:
10
+ super(BaseDataModule, self).__init__()
11
+ self.collate_fn = collate_fn
12
+ self.is_mm = False
13
+
14
+ def get_sample_set(self, overrides: dict) -> Any:
15
+ sample_params = copy.deepcopy(self.hparams)
16
+ sample_params.update(overrides)
17
+ split_file = pjoin(
18
+ eval(f"self.cfg.DATASET.{self.name.upper()}.ROOT"),
19
+ self.cfg.TEST.SPLIT + ".txt"
20
+ )
21
+ return self.Dataset(split_file=split_file, **sample_params)
22
+
23
+ def __getattr__(self, item: str) -> Any:
24
+ if item.endswith("_dataset") and not item.startswith("_"):
25
+ subset = item[:-len("_dataset")].upper()
26
+ item_c = "_" + item
27
+ if item_c not in self.__dict__:
28
+ split_file = pjoin(
29
+ eval(f"self.cfg.DATASET.{self.name.upper()}.ROOT"),
30
+ eval(f"self.cfg.{subset}.SPLIT") + ".txt"
31
+ )
32
+ self.__dict__[item_c] = self.Dataset(split_file=split_file, **self.hparams)
33
+ return getattr(self, item_c)
34
+ classname = self.__class__.__name__
35
+ raise AttributeError(f"'{classname}' object has no attribute '{item}'")
36
+
37
+ def get_dataloader_options(self, stage: str) -> dict:
38
+ stage_args = eval(f"self.cfg.{stage.upper()}")
39
+ dataloader_options = {
40
+ "batch_size": stage_args.BATCH_SIZE,
41
+ "num_workers": stage_args.NUM_WORKERS,
42
+ "collate_fn": self.collate_fn,
43
+ "persistent_workers": stage_args.PERSISTENT_WORKERS,
44
+ }
45
+ return dataloader_options
46
+
47
+ def train_dataloader(self) -> DataLoader:
48
+ dataloader_options = self.get_dataloader_options('TRAIN')
49
+ return DataLoader(self.train_dataset, shuffle=True, **dataloader_options)
50
+
51
+ def val_dataloader(self) -> DataLoader:
52
+ dataloader_options = self.get_dataloader_options('VAL')
53
+ return DataLoader(self.val_dataset, shuffle=False, **dataloader_options)
54
+
55
+ def test_dataloader(self) -> DataLoader:
56
+ dataloader_options = self.get_dataloader_options('TEST')
57
+ dataloader_options["batch_size"] = 1 if self.is_mm else self.cfg.TEST.BATCH_SIZE
58
+ return DataLoader(self.test_dataset, shuffle=False, **dataloader_options)
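The __getattr__ hook above resolves any attribute ending in "_dataset" by building the corresponding split once and caching it under a leading underscore. A self-contained toy of the same lazy-caching pattern (the list literal stands in for self.Dataset(...)):

class Lazy:
    def __getattr__(self, item):
        if item.endswith('_dataset') and not item.startswith('_'):
            cached = '_' + item
            if cached not in self.__dict__:
                print(f'building {item}')                 # happens only on first access
                self.__dict__[cached] = list(range(3))    # stand-in for self.Dataset(...)
            return getattr(self, cached)
        raise AttributeError(item)

lazy = Lazy()
assert lazy.train_dataset == [0, 1, 2]   # prints "building train_dataset"
assert lazy.train_dataset == [0, 1, 2]   # served from the cache, no rebuild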
mld/data/data.py ADDED
@@ -0,0 +1,73 @@
1
+ import copy
2
+ from typing import Callable, Optional
3
+
4
+ import numpy as np
5
+ from omegaconf import DictConfig
6
+
7
+ import torch
8
+
9
+ from .base import BaseDataModule
10
+ from .humanml.dataset import Text2MotionDataset, MotionDataset
11
+ from .humanml.scripts.motion_process import recover_from_ric
12
+
13
+
14
+ # (nfeats, njoints)
15
+ dataset_map = {'humanml3d': (263, 22), 'kit': (251, 21)}
16
+
17
+
18
+ class DataModule(BaseDataModule):
19
+
20
+ def __init__(self,
21
+ name: str,
22
+ cfg: DictConfig,
23
+ motion_only: bool,
24
+ collate_fn: Optional[Callable] = None,
25
+ **kwargs) -> None:
26
+ super().__init__(collate_fn=collate_fn)
27
+ self.cfg = cfg
28
+ self.name = name
29
+ self.nfeats, self.njoints = dataset_map[name]
30
+ self.hparams = copy.deepcopy({**kwargs, 'njoints': self.njoints})
31
+ self.Dataset = MotionDataset if motion_only else Text2MotionDataset
32
+ sample_overrides = {"tiny": True, "progress_bar": False}
33
+ self._sample_set = self.get_sample_set(overrides=sample_overrides)
34
+
35
+ def denorm_spatial(self, hint: torch.Tensor) -> torch.Tensor:
36
+ raw_mean = torch.tensor(self._sample_set.raw_mean).to(hint)
37
+ raw_std = torch.tensor(self._sample_set.raw_std).to(hint)
38
+ hint = hint * raw_std + raw_mean
39
+ return hint
40
+
41
+ def norm_spatial(self, hint: torch.Tensor) -> torch.Tensor:
42
+ raw_mean = torch.tensor(self._sample_set.raw_mean).to(hint)
43
+ raw_std = torch.tensor(self._sample_set.raw_std).to(hint)
44
+ hint = (hint - raw_mean) / raw_std
45
+ return hint
46
+
47
+ def feats2joints(self, features: torch.Tensor) -> torch.Tensor:
48
+ mean = torch.tensor(self.hparams['mean']).to(features)
49
+ std = torch.tensor(self.hparams['std']).to(features)
50
+ features = features * std + mean
51
+ return recover_from_ric(features, self.njoints)
52
+
53
+ def renorm4t2m(self, features: torch.Tensor) -> torch.Tensor:
54
+ # renorm to t2m norms for using t2m evaluators
55
+ ori_mean = torch.tensor(self.hparams['mean']).to(features)
56
+ ori_std = torch.tensor(self.hparams['std']).to(features)
57
+ eval_mean = torch.tensor(self.hparams['mean_eval']).to(features)
58
+ eval_std = torch.tensor(self.hparams['std_eval']).to(features)
59
+ features = features * ori_std + ori_mean
60
+ features = (features - eval_mean) / eval_std
61
+ return features
62
+
63
+ def mm_mode(self, mm_on: bool = True) -> None:
64
+ if mm_on:
65
+ self.is_mm = True
66
+ self.name_list = self.test_dataset.name_list
67
+ self.mm_list = np.random.choice(self.name_list,
68
+ self.cfg.TEST.MM_NUM_SAMPLES,
69
+ replace=False)
70
+ self.test_dataset.name_list = self.mm_list
71
+ else:
72
+ self.is_mm = False
73
+ self.test_dataset.name_list = self.name_list
mld/data/get_data.py ADDED
@@ -0,0 +1,79 @@
1
+ from typing import Optional
2
+ from os.path import join as pjoin
3
+
4
+ import numpy as np
5
+
6
+ from omegaconf import DictConfig
7
+
8
+ from .data import DataModule
9
+ from .base import BaseDataModule
10
+ from .utils import mld_collate, mld_collate_motion_only
11
+ from .humanml.utils.word_vectorizer import WordVectorizer
12
+
13
+
14
+ def get_mean_std(phase: str, cfg: DictConfig, dataset_name: str) -> tuple[np.ndarray, np.ndarray]:
15
+ name = "t2m" if dataset_name == "humanml3d" else dataset_name
16
+ assert name in ["t2m", "kit"]
17
+ if phase in ["val"]:
18
+ if name == 't2m':
19
+ data_root = pjoin(cfg.model.t2m_path, name, "Comp_v6_KLD01", "meta")
20
+ elif name == 'kit':
21
+ data_root = pjoin(cfg.model.t2m_path, name, "Comp_v6_KLD005", "meta")
22
+ else:
23
+ raise ValueError("Only support t2m and kit")
24
+ mean = np.load(pjoin(data_root, "mean.npy"))
25
+ std = np.load(pjoin(data_root, "std.npy"))
26
+ else:
27
+ data_root = eval(f"cfg.DATASET.{dataset_name.upper()}.ROOT")
28
+ mean = np.load(pjoin(data_root, "Mean.npy"))
29
+ std = np.load(pjoin(data_root, "Std.npy"))
30
+
31
+ return mean, std
32
+
33
+
34
+ def get_WordVectorizer(cfg: DictConfig, dataset_name: str) -> Optional[WordVectorizer]:
35
+ if dataset_name.lower() in ["humanml3d", "kit"]:
36
+ return WordVectorizer(cfg.DATASET.WORD_VERTILIZER_PATH, "our_vab")
37
+ else:
38
+ raise ValueError("Only support WordVectorizer for HumanML3D and KIT")
39
+
40
+
41
+ dataset_module_map = {"humanml3d": DataModule, "kit": DataModule}
42
+ motion_subdir = {"humanml3d": "new_joint_vecs", "kit": "new_joint_vecs"}
43
+
44
+
45
+ def get_dataset(cfg: DictConfig, motion_only: bool = False) -> BaseDataModule:
46
+ dataset_name = cfg.DATASET.NAME
47
+ if dataset_name.lower() in ["humanml3d", "kit"]:
48
+ data_root = eval(f"cfg.DATASET.{dataset_name.upper()}.ROOT")
49
+ mean, std = get_mean_std('train', cfg, dataset_name)
50
+ mean_eval, std_eval = get_mean_std("val", cfg, dataset_name)
51
+ wordVectorizer = None if motion_only else get_WordVectorizer(cfg, dataset_name)
52
+ collate_fn = mld_collate_motion_only if motion_only else mld_collate
53
+ dataset = dataset_module_map[dataset_name.lower()](
54
+ name=dataset_name.lower(),
55
+ cfg=cfg,
56
+ motion_only=motion_only,
57
+ collate_fn=collate_fn,
58
+ mean=mean,
59
+ std=std,
60
+ mean_eval=mean_eval,
61
+ std_eval=std_eval,
62
+ w_vectorizer=wordVectorizer,
63
+ text_dir=pjoin(data_root, "texts"),
64
+ motion_dir=pjoin(data_root, motion_subdir[dataset_name]),
65
+ max_motion_length=cfg.DATASET.SAMPLER.MAX_LEN,
66
+ min_motion_length=cfg.DATASET.SAMPLER.MIN_LEN,
67
+ max_text_len=cfg.DATASET.SAMPLER.MAX_TEXT_LEN,
68
+ unit_length=eval(f"cfg.DATASET.{dataset_name.upper()}.UNIT_LEN"),
69
+ fps=eval(f"cfg.DATASET.{dataset_name.upper()}.FRAME_RATE"),
70
+ padding_to_max=cfg.DATASET.PADDING_TO_MAX,
71
+ window_size=cfg.DATASET.WINDOW_SIZE,
72
+ control_args=eval(f"cfg.DATASET.{dataset_name.upper()}.CONTROL_ARGS"))
73
+
74
+ cfg.DATASET.NFEATS = dataset.nfeats
75
+ cfg.DATASET.NJOINTS = dataset.njoints
76
+ return dataset
77
+
78
+ elif dataset_name.lower() in ["humanact12", 'uestc', "amass"]:
79
+ raise NotImplementedError
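A rough end-to-end usage sketch, assuming the HumanML3D files, GloVe vectors, and evaluator meta statistics are prepared and the script is launched with --cfg (the script name is hypothetical):

# python some_script.py --cfg configs/motionlcm_t2m.yaml
from mld.config import parse_args
from mld.data.get_data import get_dataset

cfg = parse_args()
dataset = get_dataset(cfg)              # also fills cfg.DATASET.NFEATS / NJOINTS
loader = dataset.train_dataloader()     # batches are built by mld_collate
batch = next(iter(loader))
print(batch['motion'].shape, batch['length'][:4])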
mld/data/humanml/__init__.py ADDED
File without changes
mld/data/humanml/common/quaternion.py ADDED
@@ -0,0 +1,29 @@
1
+ import torch
2
+
3
+
4
+ def qinv(q: torch.Tensor) -> torch.Tensor:
5
+ assert q.shape[-1] == 4, 'q must be a tensor of shape (*, 4)'
6
+ mask = torch.ones_like(q)
7
+ mask[..., 1:] = -mask[..., 1:]
8
+ return q * mask
9
+
10
+
11
+ def qrot(q: torch.Tensor, v: torch.Tensor) -> torch.Tensor:
12
+ """
13
+ Rotate vector(s) v about the rotation described by quaternion(s) q.
14
+ Expects a tensor of shape (*, 4) for q and a tensor of shape (*, 3) for v,
15
+ where * denotes any number of dimensions.
16
+ Returns a tensor of shape (*, 3).
17
+ """
18
+ assert q.shape[-1] == 4
19
+ assert v.shape[-1] == 3
20
+ assert q.shape[:-1] == v.shape[:-1]
21
+
22
+ original_shape = list(v.shape)
23
+ q = q.contiguous().view(-1, 4)
24
+ v = v.contiguous().view(-1, 3)
25
+
26
+ qvec = q[:, 1:]
27
+ uv = torch.cross(qvec, v, dim=1)
28
+ uuv = torch.cross(qvec, uv, dim=1)
29
+ return (v + 2 * (q[:, :1] * uv + uuv)).view(original_shape)
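A quick self-contained sanity check of the (w, x, y, z) convention used above, assuming the repository is on PYTHONPATH: rotating the +x axis by 90° about +y lands on -z, and qinv undoes the rotation.

import math
import torch
from mld.data.humanml.common.quaternion import qinv, qrot

q = torch.tensor([math.cos(math.pi / 4), 0.0, math.sin(math.pi / 4), 0.0])  # 90° about +y
v = torch.tensor([1.0, 0.0, 0.0])
assert torch.allclose(qrot(q, v), torch.tensor([0.0, 0.0, -1.0]), atol=1e-6)
assert torch.allclose(qrot(qinv(q), qrot(q, v)), v, atol=1e-6)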
mld/data/humanml/dataset.py ADDED
@@ -0,0 +1,348 @@
1
+ import os
2
+ import random
3
+ import logging
4
+ import codecs as cs
5
+ from os.path import join as pjoin
6
+
7
+ import numpy as np
8
+ from rich.progress import track
9
+
10
+ import torch
11
+ from torch.utils.data import Dataset
12
+
13
+ from .scripts.motion_process import recover_from_ric
14
+ from .utils.word_vectorizer import WordVectorizer
15
+
16
+ logger = logging.getLogger(__name__)
17
+
18
+
19
+ class MotionDataset(Dataset):
20
+ def __init__(self, mean: np.ndarray, std: np.ndarray,
21
+ split_file: str, motion_dir: str, window_size: int,
22
+ tiny: bool = False, progress_bar: bool = True, **kwargs) -> None:
23
+ self.data = []
24
+ self.lengths = []
25
+ id_list = []
26
+ with cs.open(split_file, "r") as f:
27
+ for line in f.readlines():
28
+ id_list.append(line.strip())
29
+
30
+ maxdata = 10 if tiny else 1e10
31
+ if progress_bar:
32
+ enumerator = enumerate(
33
+ track(
34
+ id_list,
35
+ f"Loading HumanML3D {split_file.split('/')[-1].split('.')[0]}",
36
+ ))
37
+ else:
38
+ enumerator = enumerate(id_list)
39
+
40
+ count = 0
41
+ for i, name in enumerator:
42
+ if count > maxdata:
43
+ break
44
+ try:
45
+ motion = np.load(pjoin(motion_dir, name + '.npy'))
46
+ if motion.shape[0] < window_size:
47
+ continue
48
+ self.lengths.append(motion.shape[0] - window_size)
49
+ self.data.append(motion)
50
+ except Exception as e:
51
+ print(e)
52
+ pass
53
+
54
+ self.cumsum = np.cumsum([0] + self.lengths)
55
+ if not tiny:
56
+ logger.info("Total number of motions {}, snippets {}".format(len(self.data), self.cumsum[-1]))
57
+
58
+ self.mean = mean
59
+ self.std = std
60
+ self.window_size = window_size
61
+
62
+ def __len__(self) -> int:
63
+ return self.cumsum[-1]
64
+
65
+ def __getitem__(self, item: int) -> tuple:
66
+ if item != 0:
67
+ motion_id = np.searchsorted(self.cumsum, item) - 1
68
+ idx = item - self.cumsum[motion_id] - 1
69
+ else:
70
+ motion_id = 0
71
+ idx = 0
72
+ motion = self.data[motion_id][idx:idx + self.window_size]
73
+ "Z Normalization"
74
+ motion = (motion - self.mean) / self.std
75
+ return motion, self.window_size
76
+
77
+
78
+ class Text2MotionDataset(Dataset):
79
+
80
+ def __init__(
81
+ self,
82
+ mean: np.ndarray,
83
+ std: np.ndarray,
84
+ split_file: str,
85
+ w_vectorizer: WordVectorizer,
86
+ max_motion_length: int,
87
+ min_motion_length: int,
88
+ max_text_len: int,
89
+ unit_length: int,
90
+ motion_dir: str,
91
+ text_dir: str,
92
+ fps: int,
93
+ padding_to_max: bool,
94
+ njoints: int,
95
+ tiny: bool = False,
96
+ progress_bar: bool = True,
97
+ **kwargs,
98
+ ) -> None:
99
+ self.w_vectorizer = w_vectorizer
100
+ self.max_motion_length = max_motion_length
101
+ self.min_motion_length = min_motion_length
102
+ self.max_text_len = max_text_len
103
+ self.unit_length = unit_length
104
+ self.padding_to_max = padding_to_max
105
+ self.njoints = njoints
106
+
107
+ data_dict = {}
108
+ id_list = []
109
+ with cs.open(split_file, "r") as f:
110
+ for line in f.readlines():
111
+ id_list.append(line.strip())
112
+ self.id_list = id_list
113
+
114
+ maxdata = 10 if tiny else 1e10
115
+ if progress_bar:
116
+ enumerator = enumerate(
117
+ track(
118
+ id_list,
119
+ f"Loading HumanML3D {split_file.split('/')[-1].split('.')[0]}",
120
+ ))
121
+ else:
122
+ enumerator = enumerate(id_list)
123
+ count = 0
124
+ bad_count = 0
125
+ new_name_list = []
126
+ length_list = []
127
+ for i, name in enumerator:
128
+ if count > maxdata:
129
+ break
130
+ try:
131
+ motion = np.load(pjoin(motion_dir, name + ".npy"))
132
+ if len(motion) < self.min_motion_length or len(motion) >= self.max_motion_length:
133
+ bad_count += 1
134
+ continue
135
+ text_data = []
136
+ flag = False
137
+ with cs.open(pjoin(text_dir, name + ".txt")) as f:
138
+ for line in f.readlines():
139
+ text_dict = {}
140
+ line_split = line.strip().split("#")
141
+ caption = line_split[0]
142
+ tokens = line_split[1].split(" ")
143
+ f_tag = float(line_split[2])
144
+ to_tag = float(line_split[3])
145
+ f_tag = 0.0 if np.isnan(f_tag) else f_tag
146
+ to_tag = 0.0 if np.isnan(to_tag) else to_tag
147
+
148
+ text_dict["caption"] = caption
149
+ text_dict["tokens"] = tokens
150
+ if f_tag == 0.0 and to_tag == 0.0:
151
+ flag = True
152
+ text_data.append(text_dict)
153
+ else:
154
+ try:
155
+ n_motion = motion[int(f_tag * fps): int(to_tag * fps)]
156
+ if (len(n_motion)) < self.min_motion_length or \
157
+ len(n_motion) >= self.max_motion_length:
158
+ continue
159
+ new_name = random.choice("ABCDEFGHIJKLMNOPQRSTUVW") + "_" + name
160
+ while new_name in data_dict:
161
+ new_name = random.choice("ABCDEFGHIJKLMNOPQRSTUVW") + "_" + name
162
+ data_dict[new_name] = {
163
+ "motion": n_motion,
164
+ "length": len(n_motion),
165
+ "text": [text_dict],
166
+ }
167
+ new_name_list.append(new_name)
168
+ length_list.append(len(n_motion))
169
+ except ValueError:
170
+ print(line_split)
171
+ print(line_split[2], line_split[3], f_tag, to_tag, name)
172
+
173
+ if flag:
174
+ data_dict[name] = {
175
+ "motion": motion,
176
+ "length": len(motion),
177
+ "text": text_data,
178
+ }
179
+ new_name_list.append(name)
180
+ length_list.append(len(motion))
181
+ count += 1
182
+ except Exception as e:
183
+ print(e)
184
+ pass
185
+
186
+ name_list, length_list = zip(
187
+ *sorted(zip(new_name_list, length_list), key=lambda x: x[1]))
188
+
189
+ if not tiny:
190
+ logger.info(f"Reading {len(self.id_list)} motions from {split_file}.")
191
+ logger.info(f"Total {len(name_list)} motions are used.")
192
+ logger.info(f"{bad_count} motion sequences not within the length range of "
193
+ f"[{self.min_motion_length}, {self.max_motion_length}) are filtered out.")
194
+
195
+ self.mean = mean
196
+ self.std = std
197
+
198
+ control_args = kwargs['control_args']
199
+ self.control_mode = None
200
+ if os.path.exists(control_args.MEAN_STD_PATH):
201
+ self.raw_mean = np.load(pjoin(control_args.MEAN_STD_PATH, 'Mean_raw.npy'))
202
+ self.raw_std = np.load(pjoin(control_args.MEAN_STD_PATH, 'Std_raw.npy'))
203
+ else:
204
+ self.raw_mean = self.raw_std = None
205
+ if not tiny and control_args.CONTROL:
206
+ self.t_ctrl = control_args.TEMPORAL
207
+ self.training_control_joints = np.array(control_args.TRAIN_JOINTS)
208
+ self.testing_control_joints = np.array(control_args.TEST_JOINTS)
209
+ self.training_density = control_args.TRAIN_DENSITY
210
+ self.testing_density = control_args.TEST_DENSITY
211
+
212
+ self.control_mode = 'val' if ('test' in split_file or 'val' in split_file) else 'train'
213
+ if self.control_mode == 'train':
214
+ logger.info(f'Training Control Joints: {self.training_control_joints}')
215
+ logger.info(f'Training Control Density: {self.training_density}')
216
+ else:
217
+ logger.info(f'Testing Control Joints: {self.testing_control_joints}')
218
+ logger.info(f'Testing Control Density: {self.testing_density}')
219
+ logger.info(f"Temporal Control: {self.t_ctrl}")
220
+
221
+ self.data_dict = data_dict
222
+ self.name_list = name_list
223
+
224
+ def __len__(self) -> int:
225
+ return len(self.name_list)
226
+
227
+ def random_mask(self, joints: np.ndarray) -> tuple[np.ndarray, np.ndarray]:
228
+ choose_joint = self.testing_control_joints
229
+
230
+ length = joints.shape[0]
231
+ density = self.testing_density
232
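+ # Densities of 1, 2 or 5 request that exact number of control keyframes; any other value is treated as a percentage of the sequence length.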
+ if density in [1, 2, 5]:
233
+ choose_seq_num = density
234
+ else:
235
+ choose_seq_num = int(length * density / 100)
236
+
237
+ if self.t_ctrl:
238
+ choose_seq = np.arange(0, choose_seq_num)
239
+ else:
240
+ choose_seq = np.random.choice(length, choose_seq_num, replace=False)
241
+ choose_seq.sort()
242
+
243
+ mask_seq = np.zeros((length, self.njoints, 3))
244
+ for cj in choose_joint:
245
+ mask_seq[choose_seq, cj] = 1.0
246
+
247
+ joints = (joints - self.raw_mean) / self.raw_std
248
+ joints = joints * mask_seq
249
+ return joints, mask_seq
250
+
251
+ def random_mask_train(self, joints: np.ndarray) -> tuple[np.ndarray, np.ndarray]:
252
+ if self.t_ctrl:
253
+ choose_joint = self.training_control_joints
254
+ else:
255
+ num_joints = len(self.training_control_joints)
256
+ num_joints_control = 1
257
+ choose_joint = np.random.choice(num_joints, num_joints_control, replace=False)
258
+ choose_joint = self.training_control_joints[choose_joint]
259
+
260
+ length = joints.shape[0]
261
+
262
+ if self.training_density == 'random':
263
+ choose_seq_num = np.random.choice(length - 1, 1) + 1
264
+ else:
265
+ choose_seq_num = int(length * random.uniform(self.training_density[0], self.training_density[1]) / 100)
266
+
267
+ if self.t_ctrl:
268
+ choose_seq = np.arange(0, choose_seq_num)
269
+ else:
270
+ choose_seq = np.random.choice(length, choose_seq_num, replace=False)
271
+ choose_seq.sort()
272
+
273
+ mask_seq = np.zeros((length, self.njoints, 3))
274
+ for cj in choose_joint:
275
+ mask_seq[choose_seq, cj] = 1
276
+
277
+ joints = (joints - self.raw_mean) / self.raw_std
278
+ joints = joints * mask_seq
279
+ return joints, mask_seq
280
+
281
+ def __getitem__(self, idx: int) -> tuple:
282
+ data = self.data_dict[self.name_list[idx]]
283
+ motion, m_length, text_list = data["motion"], data["length"], data["text"]
284
+ # Randomly select a caption
285
+ text_data = random.choice(text_list)
286
+ caption, tokens = text_data["caption"], text_data["tokens"]
287
+
288
+ if len(tokens) < self.max_text_len:
289
+ # pad with "unk"
290
+ tokens = ["sos/OTHER"] + tokens + ["eos/OTHER"]
291
+ sent_len = len(tokens)
292
+ tokens = tokens + ["unk/OTHER"] * (self.max_text_len + 2 - sent_len)
293
+ else:
294
+ # crop
295
+ tokens = tokens[:self.max_text_len]
296
+ tokens = ["sos/OTHER"] + tokens + ["eos/OTHER"]
297
+ sent_len = len(tokens)
298
+ pos_one_hots = []
299
+ word_embeddings = []
300
+ for token in tokens:
301
+ word_emb, pos_oh = self.w_vectorizer[token]
302
+ pos_one_hots.append(pos_oh[None, :])
303
+ word_embeddings.append(word_emb[None, :])
304
+ pos_one_hots = np.concatenate(pos_one_hots, axis=0)
305
+ word_embeddings = np.concatenate(word_embeddings, axis=0)
306
+
307
+ # Crop the motion length to a multiple of unit_length and introduce small variations
308
+ if self.unit_length < 10:
309
+ coin2 = np.random.choice(["single", "single", "double"])
310
+ else:
311
+ coin2 = "single"
312
+
313
+ if coin2 == "double":
314
+ m_length = (m_length // self.unit_length - 1) * self.unit_length
315
+ elif coin2 == "single":
316
+ m_length = (m_length // self.unit_length) * self.unit_length
317
+ idx = random.randint(0, len(motion) - m_length)
318
+ motion = motion[idx:idx + m_length]
319
+
320
+ hint, hint_mask = None, None
321
+ if self.control_mode is not None:
322
+ joints = recover_from_ric(torch.from_numpy(motion).float(), self.njoints)
323
+ joints = joints.numpy()
324
+ if self.control_mode == 'train':
325
+ hint, hint_mask = self.random_mask_train(joints)
326
+ else:
327
+ hint, hint_mask = self.random_mask(joints)
328
+
329
+ if self.padding_to_max:
330
+ padding = np.zeros((self.max_motion_length - m_length, *hint.shape[1:]))
331
+ hint = np.concatenate([hint, padding], axis=0)
332
+ hint_mask = np.concatenate([hint_mask, padding], axis=0)
333
+
334
+ "Z Normalization"
335
+ motion = (motion - self.mean) / self.std
336
+
337
+ if self.padding_to_max:
338
+ padding = np.zeros((self.max_motion_length - m_length, motion.shape[1]))
339
+ motion = np.concatenate([motion, padding], axis=0)
340
+
341
+ return (word_embeddings,
342
+ pos_one_hots,
343
+ caption,
344
+ sent_len,
345
+ motion,
346
+ m_length,
347
+ "_".join(tokens),
348
+ (hint, hint_mask))
mld/data/humanml/scripts/motion_process.py ADDED
@@ -0,0 +1,51 @@
1
+ import torch
2
+
3
+ from ..common.quaternion import qinv, qrot
4
+
5
+
6
+ # Recover global angle and positions for rotation dataset
7
+ # root_rot_velocity (B, seq_len, 1)
8
+ # root_linear_velocity (B, seq_len, 2)
9
+ # root_y (B, seq_len, 1)
10
+ # ric_data (B, seq_len, (joint_num - 1)*3)
11
+ # rot_data (B, seq_len, (joint_num - 1)*6)
12
+ # local_velocity (B, seq_len, joint_num*3)
13
+ # foot contact (B, seq_len, 4)
14
+ def recover_root_rot_pos(data: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
15
+ rot_vel = data[..., 0]
16
+ r_rot_ang = torch.zeros_like(rot_vel).to(data.device)
17
+ '''Get Y-axis rotation from rotation velocity'''
18
+ r_rot_ang[..., 1:] = rot_vel[..., :-1]
19
+ r_rot_ang = torch.cumsum(r_rot_ang, dim=-1)
20
+
21
+ r_rot_quat = torch.zeros(data.shape[:-1] + (4,)).to(data.device)
22
+ r_rot_quat[..., 0] = torch.cos(r_rot_ang)
23
+ r_rot_quat[..., 2] = torch.sin(r_rot_ang)
24
+
25
+ r_pos = torch.zeros(data.shape[:-1] + (3,)).to(data.device)
26
+ r_pos[..., 1:, [0, 2]] = data[..., :-1, 1:3]
27
+ '''Add Y-axis rotation to root position'''
28
+ r_pos = qrot(qinv(r_rot_quat), r_pos)
29
+
30
+ r_pos = torch.cumsum(r_pos, dim=-2)
31
+
32
+ r_pos[..., 1] = data[..., 3]
33
+ return r_rot_quat, r_pos
34
+
35
+
36
+ def recover_from_ric(data: torch.Tensor, joints_num: int) -> torch.Tensor:
37
+ r_rot_quat, r_pos = recover_root_rot_pos(data)
38
+ positions = data[..., 4:(joints_num - 1) * 3 + 4]
39
+ positions = positions.view(positions.shape[:-1] + (-1, 3))
40
+
41
+ '''Add Y-axis rotation to local joints'''
42
+ positions = qrot(qinv(r_rot_quat[..., None, :]).expand(positions.shape[:-1] + (4,)), positions)
43
+
44
+ '''Add root XZ to joints'''
45
+ positions[..., 0] += r_pos[..., 0:1]
46
+ positions[..., 2] += r_pos[..., 2:3]
47
+
48
+ '''Concat root and joints'''
49
+ positions = torch.cat([r_pos.unsqueeze(-2), positions], dim=-2)
50
+
51
+ return positions
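A shape-only check of recover_from_ric for the HumanML3D layout (263-dim features, 22 joints); random features are fine here because only tensor shapes are being verified.

import torch
from mld.data.humanml.scripts.motion_process import recover_from_ric

feats = torch.randn(2, 60, 263)        # (batch, frames, features)
joints = recover_from_ric(feats, 22)   # global joint positions
assert joints.shape == (2, 60, 22, 3)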
mld/data/humanml/utils/__init__.py ADDED
File without changes
mld/data/humanml/utils/paramUtil.py ADDED
@@ -0,0 +1,62 @@
1
+ import numpy as np
2
+
3
+ # Define a kinematic tree for the skeletal structure
4
+ kit_kinematic_chain = [[0, 11, 12, 13, 14, 15], [0, 16, 17, 18, 19, 20], [0, 1, 2, 3, 4], [3, 5, 6, 7], [3, 8, 9, 10]]
5
+
6
+ kit_raw_offsets = np.array(
7
+ [
8
+ [0, 0, 0],
9
+ [0, 1, 0],
10
+ [0, 1, 0],
11
+ [0, 1, 0],
12
+ [0, 1, 0],
13
+ [1, 0, 0],
14
+ [0, -1, 0],
15
+ [0, -1, 0],
16
+ [-1, 0, 0],
17
+ [0, -1, 0],
18
+ [0, -1, 0],
19
+ [1, 0, 0],
20
+ [0, -1, 0],
21
+ [0, -1, 0],
22
+ [0, 0, 1],
23
+ [0, 0, 1],
24
+ [-1, 0, 0],
25
+ [0, -1, 0],
26
+ [0, -1, 0],
27
+ [0, 0, 1],
28
+ [0, 0, 1]
29
+ ]
30
+ )
31
+
32
+ t2m_raw_offsets = np.array([[0, 0, 0],
33
+ [1, 0, 0],
34
+ [-1, 0, 0],
35
+ [0, 1, 0],
36
+ [0, -1, 0],
37
+ [0, -1, 0],
38
+ [0, 1, 0],
39
+ [0, -1, 0],
40
+ [0, -1, 0],
41
+ [0, 1, 0],
42
+ [0, 0, 1],
43
+ [0, 0, 1],
44
+ [0, 1, 0],
45
+ [1, 0, 0],
46
+ [-1, 0, 0],
47
+ [0, 0, 1],
48
+ [0, -1, 0],
49
+ [0, -1, 0],
50
+ [0, -1, 0],
51
+ [0, -1, 0],
52
+ [0, -1, 0],
53
+ [0, -1, 0]])
54
+
55
+ t2m_kinematic_chain = [[0, 2, 5, 8, 11], [0, 1, 4, 7, 10], [0, 3, 6, 9, 12, 15], [9, 14, 17, 19, 21],
56
+ [9, 13, 16, 18, 20]]
57
+ t2m_left_hand_chain = [[20, 22, 23, 24], [20, 34, 35, 36], [20, 25, 26, 27], [20, 31, 32, 33], [20, 28, 29, 30]]
58
+ t2m_right_hand_chain = [[21, 43, 44, 45], [21, 46, 47, 48], [21, 40, 41, 42], [21, 37, 38, 39], [21, 49, 50, 51]]
59
+
60
+ kit_tgt_skel_id = '03950'
61
+
62
+ t2m_tgt_skel_id = '000021'
mld/data/humanml/utils/plot_script.py ADDED
@@ -0,0 +1,98 @@
1
+ from textwrap import wrap
2
+ from typing import Optional
3
+
4
+ import numpy as np
5
+
6
+ import matplotlib.pyplot as plt
7
+ import mpl_toolkits.mplot3d.axes3d as p3
8
+ from matplotlib.animation import FuncAnimation
9
+ from mpl_toolkits.mplot3d.art3d import Poly3DCollection
10
+
11
+ import mld.data.humanml.utils.paramUtil as paramUtil
12
+
13
+ skeleton = paramUtil.t2m_kinematic_chain
14
+
15
+
16
+ def plot_3d_motion(save_path: str, joints: np.ndarray, title: str,
17
+ figsize: tuple[int, int] = (3, 3),
18
+ fps: int = 120, radius: int = 3, kinematic_tree: list = skeleton,
19
+ hint: Optional[np.ndarray] = None) -> None:
20
+
21
+ title = '\n'.join(wrap(title, 20))
22
+
23
+ def init():
24
+ ax.set_xlim3d([-radius / 2, radius / 2])
25
+ ax.set_ylim3d([0, radius])
26
+ ax.set_zlim3d([-radius / 3., radius * 2 / 3.])
27
+ fig.suptitle(title, fontsize=10)
28
+ ax.grid(b=False)
29
+
30
+ def plot_xzPlane(minx, maxx, miny, minz, maxz):
31
+ # Plot a plane XZ
32
+ verts = [
33
+ [minx, miny, minz],
34
+ [minx, miny, maxz],
35
+ [maxx, miny, maxz],
36
+ [maxx, miny, minz]
37
+ ]
38
+ xz_plane = Poly3DCollection([verts])
39
+ xz_plane.set_facecolor((0.5, 0.5, 0.5, 0.5))
40
+ ax.add_collection3d(xz_plane)
41
+
42
+ # (seq_len, joints_num, 3)
43
+ data = joints.copy().reshape(len(joints), -1, 3)
44
+
45
+ data *= 1.3 # scale for visualization
46
+ if hint is not None:
47
+ mask = hint.sum(-1) != 0
48
+ hint = hint[mask]
49
+ hint *= 1.3
50
+
51
+ fig = plt.figure(figsize=figsize)
52
+ plt.tight_layout()
53
+ ax = p3.Axes3D(fig)
54
+ init()
55
+ MINS = data.min(axis=0).min(axis=0)
56
+ MAXS = data.max(axis=0).max(axis=0)
57
+ colors = ["#DD5A37", "#D69E00", "#B75A39", "#DD5A37", "#D69E00",
58
+ "#FF6D00", "#FF6D00", "#FF6D00", "#FF6D00", "#FF6D00",
59
+ "#DDB50E", "#DDB50E", "#DDB50E", "#DDB50E", "#DDB50E", ]
60
+
61
+ frame_number = data.shape[0]
62
+
63
+ height_offset = MINS[1]
64
+ data[:, :, 1] -= height_offset
65
+ if hint is not None:
66
+ hint[..., 1] -= height_offset
67
+ trajec = data[:, 0, [0, 2]]
68
+
69
+ data[..., 0] -= data[:, 0:1, 0]
70
+ data[..., 2] -= data[:, 0:1, 2]
71
+
72
+ def update(index):
73
+ ax.lines = []
74
+ ax.collections = []
75
+ ax.view_init(elev=120, azim=-90)
76
+ ax.dist = 7.5
77
+ plot_xzPlane(MINS[0] - trajec[index, 0], MAXS[0] - trajec[index, 0], 0, MINS[2] - trajec[index, 1],
78
+ MAXS[2] - trajec[index, 1])
79
+
80
+ if hint is not None:
81
+ ax.scatter(hint[..., 0] - trajec[index, 0], hint[..., 1], hint[..., 2] - trajec[index, 1], color="#80B79A")
82
+
83
+ for i, (chain, color) in enumerate(zip(kinematic_tree, colors)):
84
+ if i < 5:
85
+ linewidth = 4.0
86
+ else:
87
+ linewidth = 2.0
88
+ ax.plot3D(data[index, chain, 0], data[index, chain, 1], data[index, chain, 2], linewidth=linewidth,
89
+ color=color)
90
+
91
+ plt.axis('off')
92
+ ax.set_xticklabels([])
93
+ ax.set_yticklabels([])
94
+ ax.set_zticklabels([])
95
+
96
+ ani = FuncAnimation(fig, update, frames=frame_number, interval=1000 / fps, repeat=False)
97
+ ani.save(save_path, fps=fps)
98
+ plt.close()
mld/data/humanml/utils/word_vectorizer.py ADDED
@@ -0,0 +1,82 @@
1
+ import pickle
2
+ from os.path import join as pjoin
3
+
4
+ import numpy as np
5
+
6
+
7
+ POS_enumerator = {
8
+ 'VERB': 0,
9
+ 'NOUN': 1,
10
+ 'DET': 2,
11
+ 'ADP': 3,
12
+ 'NUM': 4,
13
+ 'AUX': 5,
14
+ 'PRON': 6,
15
+ 'ADJ': 7,
16
+ 'ADV': 8,
17
+ 'Loc_VIP': 9,
18
+ 'Body_VIP': 10,
19
+ 'Obj_VIP': 11,
20
+ 'Act_VIP': 12,
21
+ 'Desc_VIP': 13,
22
+ 'OTHER': 14
23
+ }
24
+
25
+ Loc_list = ('left', 'right', 'clockwise', 'counterclockwise', 'anticlockwise', 'forward', 'back', 'backward',
26
+ 'up', 'down', 'straight', 'curve')
27
+
28
+ Body_list = ('arm', 'chin', 'foot', 'feet', 'face', 'hand', 'mouth', 'leg', 'waist', 'eye', 'knee', 'shoulder', 'thigh')
29
+
30
+ Obj_List = ('stair', 'dumbbell', 'chair', 'window', 'floor', 'car', 'ball', 'handrail', 'baseball', 'basketball')
31
+
32
+ Act_list = ('walk', 'run', 'swing', 'pick', 'bring', 'kick', 'put', 'squat', 'throw', 'hop', 'dance', 'jump', 'turn',
33
+ 'stumble', 'dance', 'stop', 'sit', 'lift', 'lower', 'raise', 'wash', 'stand', 'kneel', 'stroll',
34
+ 'rub', 'bend', 'balance', 'flap', 'jog', 'shuffle', 'lean', 'rotate', 'spin', 'spread', 'climb')
35
+
36
+ Desc_list = ('slowly', 'carefully', 'fast', 'careful', 'slow', 'quickly', 'happy', 'angry', 'sad', 'happily',
37
+ 'angrily', 'sadly')
38
+
39
+ VIP_dict = {
40
+ 'Loc_VIP': Loc_list,
41
+ 'Body_VIP': Body_list,
42
+ 'Obj_VIP': Obj_List,
43
+ 'Act_VIP': Act_list,
44
+ 'Desc_VIP': Desc_list,
45
+ }
46
+
47
+
48
+ class WordVectorizer(object):
49
+ def __init__(self, meta_root: str, prefix: str) -> None:
50
+ vectors = np.load(pjoin(meta_root, '%s_data.npy' % prefix))
51
+ words = pickle.load(open(pjoin(meta_root, '%s_words.pkl' % prefix), 'rb'))
52
+ word2idx = pickle.load(open(pjoin(meta_root, '%s_idx.pkl' % prefix), 'rb'))
53
+ self.word2vec = {w: vectors[word2idx[w]] for w in words}
54
+
55
+ def _get_pos_ohot(self, pos: str) -> np.ndarray:
56
+ pos_vec = np.zeros(len(POS_enumerator))
57
+ if pos in POS_enumerator:
58
+ pos_vec[POS_enumerator[pos]] = 1
59
+ else:
60
+ pos_vec[POS_enumerator['OTHER']] = 1
61
+ return pos_vec
62
+
63
+ def __len__(self) -> int:
64
+ return len(self.word2vec)
65
+
66
+ def __getitem__(self, item: str) -> tuple:
67
+ word, pos = item.split('/')
68
+ if word in self.word2vec:
69
+ word_vec = self.word2vec[word]
70
+ vip_pos = None
71
+ for key, values in VIP_dict.items():
72
+ if word in values:
73
+ vip_pos = key
74
+ break
75
+ if vip_pos is not None:
76
+ pos_vec = self._get_pos_ohot(vip_pos)
77
+ else:
78
+ pos_vec = self._get_pos_ohot(pos)
79
+ else:
80
+ word_vec = self.word2vec['unk']
81
+ pos_vec = self._get_pos_ohot('OTHER')
82
+ return word_vec, pos_vec
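Usage sketch, assuming the GloVe files (our_vab_data.npy, our_vab_words.pkl, our_vab_idx.pkl) have been downloaded to the directory configured as DATASET.WORD_VERTILIZER_PATH (the path below is a placeholder):

from mld.data.humanml.utils.word_vectorizer import WordVectorizer

w_vectorizer = WordVectorizer('deps/glove', 'our_vab')     # placeholder path
word_emb, pos_oh = w_vectorizer['walk/VERB']               # GloVe vector, 15-dim POS one-hot
unk_emb, unk_pos = w_vectorizer['flibber/NOUN']            # unknown words fall back to 'unk' / OTHER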
mld/data/utils.py ADDED
@@ -0,0 +1,52 @@
1
+ import torch
2
+
3
+ from mld.utils.temos_utils import lengths_to_mask
4
+
5
+
6
+ def collate_tensors(batch: list) -> torch.Tensor:
7
+ dims = batch[0].dim()
8
+ max_size = [max([b.size(i) for b in batch]) for i in range(dims)]
9
+ size = (len(batch), ) + tuple(max_size)
10
+ canvas = batch[0].new_zeros(size=size)
11
+ for i, b in enumerate(batch):
12
+ sub_tensor = canvas[i]
13
+ for d in range(dims):
14
+ sub_tensor = sub_tensor.narrow(d, 0, b.size(d))
15
+ sub_tensor.add_(b)
16
+ return canvas
17
+
18
+
19
+ def mld_collate(batch: list) -> dict:
20
+ notnone_batches = [b for b in batch if b is not None]
21
+ notnone_batches.sort(key=lambda x: x[3], reverse=True)
22
+ adapted_batch = {
23
+ "motion":
24
+ collate_tensors([torch.tensor(b[4]).float() for b in notnone_batches]),
25
+ "text": [b[2] for b in notnone_batches],
26
+ "length": [b[5] for b in notnone_batches],
27
+ "word_embs":
28
+ collate_tensors([torch.tensor(b[0]).float() for b in notnone_batches]),
29
+ "pos_ohot":
30
+ collate_tensors([torch.tensor(b[1]).float() for b in notnone_batches]),
31
+ "text_len":
32
+ collate_tensors([torch.tensor(b[3]) for b in notnone_batches]),
33
+ "tokens": [b[6] for b in notnone_batches]
34
+ }
35
+
36
+ mask = lengths_to_mask(adapted_batch['length'], adapted_batch['motion'].device, adapted_batch['motion'].shape[1])
37
+ adapted_batch['mask'] = mask
38
+
39
+ # collate trajectory
40
+ if notnone_batches[0][-1][0] is not None:
41
+ adapted_batch['hint'] = collate_tensors([torch.tensor(b[-1][0]).float() for b in notnone_batches])
42
+ adapted_batch['hint_mask'] = collate_tensors([torch.tensor(b[-1][1]).float() for b in notnone_batches])
43
+
44
+ return adapted_batch
45
+
46
+
47
+ def mld_collate_motion_only(batch: list) -> dict:
48
+ batch = {
49
+ "motion": collate_tensors([torch.tensor(b[0]).float() for b in batch]),
50
+ "length": [b[1] for b in batch]
51
+ }
52
+ return batch
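A self-contained check of the zero-padding collation above, assuming the repository is importable (the module pulls in mld.utils.temos_utils):

import torch
from mld.data.utils import collate_tensors

batch = [torch.ones(3, 2), torch.ones(5, 2)]
padded = collate_tensors(batch)
assert padded.shape == (2, 5, 2)            # padded up to the longest item
assert padded[0, 3:].abs().sum() == 0       # the extra rows are zero padding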
mld/launch/__init__.py ADDED
File without changes
mld/launch/blender.py ADDED
@@ -0,0 +1,23 @@
1
+ # Fix blender path
2
+ import os
3
+ import sys
4
+ from argparse import ArgumentParser
5
+
6
+ sys.path.append(os.path.expanduser("~/.local/lib/python3.9/site-packages"))
7
+
8
+
9
+ # Monkey patch argparse such that
10
+ # blender / python parsing works
11
+ def parse_args(self, args=None, namespace=None):
12
+ if args is not None:
13
+ return self.parse_args_bak(args=args, namespace=namespace)
14
+ try:
15
+ idx = sys.argv.index("--")
16
+ args = sys.argv[idx + 1:] # the list after '--'
17
+ except ValueError as e: # '--' not in the list:
18
+ args = []
19
+ return self.parse_args_bak(args=args, namespace=namespace)
20
+
21
+
22
+ setattr(ArgumentParser, 'parse_args_bak', ArgumentParser.parse_args)
23
+ setattr(ArgumentParser, 'parse_args', parse_args)
mld/models/__init__.py ADDED
File without changes
mld/models/architectures/__init__.py ADDED
File without changes
mld/models/architectures/dno.py ADDED
@@ -0,0 +1,79 @@
1
+ import os
2
+
3
+ import torch
4
+ import torch.nn.functional as F
5
+ from torch.utils.tensorboard import SummaryWriter
6
+
7
+
8
+ class DNO(object):
9
+ def __init__(
10
+ self,
11
+ optimize: bool,
12
+ max_train_steps: int,
13
+ learning_rate: float,
14
+ lr_scheduler: str,
15
+ lr_warmup_steps: int,
16
+ clip_grad: bool,
17
+ loss_hint_type: str,
18
+ loss_diff_penalty: float,
19
+ loss_correlate_penalty: float,
20
+ visualize_samples: int,
21
+ visualize_ske_steps: list[int],
22
+ output_dir: str
23
+ ) -> None:
24
+
25
+ self.optimize = optimize
26
+ self.max_train_steps = max_train_steps
27
+ self.learning_rate = learning_rate
28
+ self.lr_scheduler = lr_scheduler
29
+ self.lr_warmup_steps = lr_warmup_steps
30
+ self.clip_grad = clip_grad
31
+ self.loss_hint_type = loss_hint_type
32
+ self.loss_diff_penalty = loss_diff_penalty
33
+ self.loss_correlate_penalty = loss_correlate_penalty
34
+
35
+ if loss_hint_type == 'l1':
36
+ self.loss_hint_func = F.l1_loss
37
+ elif loss_hint_type == 'l1_smooth':
38
+ self.loss_hint_func = F.smooth_l1_loss
39
+ elif loss_hint_type == 'l2':
40
+ self.loss_hint_func = F.mse_loss
41
+ else:
42
+ raise ValueError(f'Invalid loss type: {loss_hint_type}')
43
+
44
+ self.visualize_samples = float('inf') if visualize_samples == 'inf' else visualize_samples
45
+ assert self.visualize_samples >= 0
46
+ self.visualize_samples_done = 0
47
+ self.visualize_ske_steps = visualize_ske_steps
48
+ if len(visualize_ske_steps) > 0:
49
+ self.vis_dir = os.path.join(output_dir, 'vis_optimize')
50
+ os.makedirs(self.vis_dir)
51
+
52
+ self.writer = None
53
+ self.output_dir = output_dir
54
+ if self.visualize_samples > 0:
55
+ self.writer = SummaryWriter(output_dir)
56
+
57
+ @property
58
+ def do_visualize(self):
59
+ return self.visualize_samples_done < self.visualize_samples
60
+
61
+ @staticmethod
62
+ def noise_regularize_1d(noise: torch.Tensor, stop_at: int = 2, dim: int = 1) -> torch.Tensor:
63
+ size = noise.shape[dim]
64
+ if size & (size - 1) != 0:
65
+ new_size = 2 ** (size - 1).bit_length()
66
+ pad = new_size - size
67
+ pad_shape = list(noise.shape)
68
+ pad_shape[dim] = pad
69
+ pad_noise = torch.randn(*pad_shape, device=noise.device)
70
+ noise = torch.cat([noise, pad_noise], dim=dim)
71
+ size = noise.shape[dim]
72
+
73
+ loss = torch.zeros(noise.shape[0], device=noise.device)
74
+ while size > stop_at:
75
+ rolled_noise = torch.roll(noise, shifts=1, dims=dim)
76
+ loss += (noise * rolled_noise).mean(dim=tuple(range(1, noise.ndim))).pow(2)
77
+ noise = noise.view(*noise.shape[:dim], size // 2, 2, *noise.shape[dim + 1:]).mean(dim=dim + 1)
78
+ size //= 2
79
+ return loss
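A rough intuition check for noise_regularize_1d, assuming the repository is importable: the term stays small for white noise but grows for temporally correlated noise, which is what the noise optimizer penalizes.

import torch
from mld.models.architectures.dno import DNO

torch.manual_seed(0)
white = torch.randn(4, 64, 256)               # (batch, frames, dim)
smooth = torch.cumsum(white, dim=1) / 8.0     # strongly auto-correlated noise
print(DNO.noise_regularize_1d(white).mean().item())    # close to zero
print(DNO.noise_regularize_1d(smooth).mean().item())   # much larger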
mld/models/architectures/mld_clip.py ADDED
@@ -0,0 +1,72 @@
1
+ import torch
2
+ import torch.nn as nn
3
+
4
+ from transformers import AutoModel, AutoTokenizer
5
+ from sentence_transformers import SentenceTransformer
6
+
7
+
8
+ class MldTextEncoder(nn.Module):
9
+
10
+ def __init__(self, modelpath: str, last_hidden_state: bool = False) -> None:
11
+ super().__init__()
12
+
13
+ if 't5' in modelpath:
14
+ self.text_model = SentenceTransformer(modelpath)
15
+ self.tokenizer = self.text_model.tokenizer
16
+ else:
17
+ self.tokenizer = AutoTokenizer.from_pretrained(modelpath)
18
+ self.text_model = AutoModel.from_pretrained(modelpath)
19
+
20
+ self.max_length = self.tokenizer.model_max_length
21
+ if "clip" in modelpath:
22
+ self.text_encoded_dim = self.text_model.config.text_config.hidden_size
23
+ if last_hidden_state:
24
+ self.name = "clip_hidden"
25
+ else:
26
+ self.name = "clip"
27
+ elif "bert" in modelpath:
28
+ self.name = "bert"
29
+ self.text_encoded_dim = self.text_model.config.hidden_size
30
+ elif 't5' in modelpath:
31
+ self.name = 't5'
32
+ else:
33
+ raise ValueError(f"Model {modelpath} not supported")
34
+
35
+ def forward(self, texts: list[str]) -> torch.Tensor:
36
+ # get prompt text embeddings
37
+ if self.name in ["clip", "clip_hidden"]:
38
+ text_inputs = self.tokenizer(
39
+ texts,
40
+ padding="max_length",
41
+ truncation=True,
42
+ max_length=self.max_length,
43
+ return_tensors="pt",
44
+ )
45
+ text_input_ids = text_inputs.input_ids
46
+ # split into max length Clip can handle
47
+ if text_input_ids.shape[-1] > self.tokenizer.model_max_length:
48
+ text_input_ids = text_input_ids[:, :self.tokenizer.model_max_length]
49
+ elif self.name == "bert":
50
+ text_inputs = self.tokenizer(texts, return_tensors="pt", padding=True)
51
+
52
+ if self.name == "clip":
53
+ # (batch_Size, text_encoded_dim)
54
+ text_embeddings = self.text_model.get_text_features(
55
+ text_input_ids.to(self.text_model.device))
56
+ # (batch_Size, 1, text_encoded_dim)
57
+ text_embeddings = text_embeddings.unsqueeze(1)
58
+ elif self.name == "clip_hidden":
59
+ # (batch_Size, seq_length , text_encoded_dim)
60
+ text_embeddings = self.text_model.text_model(
61
+ text_input_ids.to(self.text_model.device)).last_hidden_state
62
+ elif self.name == "bert":
63
+ # (batch_Size, seq_length , text_encoded_dim)
64
+ text_embeddings = self.text_model(
65
+ **text_inputs.to(self.text_model.device)).last_hidden_state
66
+ elif self.name == 't5':
67
+ text_embeddings = self.text_model.encode(texts, show_progress_bar=False, convert_to_tensor=True, batch_size=len(texts))
68
+ text_embeddings = text_embeddings.unsqueeze(1)
69
+ else:
70
+ raise NotImplementedError(f"Model {self.name} not implemented")
71
+
72
+ return text_embeddings
mld/models/architectures/mld_denoiser.py ADDED
@@ -0,0 +1,200 @@
1
+ from typing import Optional, Union
2
+
3
+ import torch
4
+ import torch.nn as nn
5
+
6
+ from mld.models.operator.embeddings import TimestepEmbedding, Timesteps
7
+ from mld.models.operator.attention import (SkipTransformerEncoder,
8
+ SkipTransformerDecoder,
9
+ TransformerDecoder,
10
+ TransformerDecoderLayer,
11
+ TransformerEncoder,
12
+ TransformerEncoderLayer)
13
+ from mld.models.operator.moe import MoeTransformerEncoderLayer, MoeTransformerDecoderLayer
14
+ from mld.models.operator.utils import get_clones, get_activation_fn, zero_module
15
+ from mld.models.operator.position_encoding import build_position_encoding
16
+
17
+
18
+ def load_balancing_loss_func(router_logits: tuple, num_experts: int = 4, topk: int = 2):
19
+ router_logits = torch.cat(router_logits, dim=0)
20
+ routing_weights = torch.nn.functional.softmax(router_logits, dim=-1)
21
+ _, selected_experts = torch.topk(routing_weights, topk, dim=-1)
22
+ expert_mask = torch.nn.functional.one_hot(selected_experts, num_experts)
23
+ tokens_per_expert = torch.mean(expert_mask.float(), dim=0)
24
+ router_prob_per_expert = torch.mean(routing_weights, dim=0)
25
+ overall_loss = num_experts * torch.sum(tokens_per_expert * router_prob_per_expert.unsqueeze(0))
26
+ return overall_loss
27
+
28
+
29
+ class MldDenoiser(nn.Module):
30
+
31
+ def __init__(self,
32
+ latent_dim: list = [1, 256],
33
+ hidden_dim: Optional[int] = None,
34
+ text_dim: int = 768,
35
+ time_dim: int = 768,
36
+ ff_size: int = 1024,
37
+ num_layers: int = 9,
38
+ num_heads: int = 4,
39
+ dropout: float = 0.1,
40
+ normalize_before: bool = False,
41
+ norm_eps: float = 1e-5,
42
+ activation: str = "gelu",
43
+ norm_post: bool = True,
44
+ activation_post: Optional[str] = None,
45
+ flip_sin_to_cos: bool = True,
46
+ freq_shift: float = 0,
47
+ time_act_fn: str = 'silu',
48
+ time_post_act_fn: Optional[str] = None,
49
+ position_embedding: str = "learned",
50
+ arch: str = "trans_enc",
51
+ add_mem_pos: bool = True,
52
+ force_pre_post_proj: bool = False,
53
+ text_act_fn: str = 'relu',
54
+ time_cond_proj_dim: Optional[int] = None,
55
+ zero_init_cond: bool = True,
56
+ is_controlnet: bool = False,
57
+ controlnet_embed_dim: Optional[int] = None,
58
+ controlnet_act_fn: str = 'silu',
59
+ moe: bool = False,
60
+ moe_num_experts: int = 4,
61
+ moe_topk: int = 2,
62
+ moe_loss_weight: float = 1e-2,
63
+ moe_jitter_noise: Optional[float] = None
64
+ ) -> None:
65
+ super(MldDenoiser, self).__init__()
66
+
67
+ self.latent_dim = latent_dim[-1] if hidden_dim is None else hidden_dim
68
+ add_pre_post_proj = force_pre_post_proj or (hidden_dim is not None and hidden_dim != latent_dim[-1])
69
+ self.latent_pre = nn.Linear(latent_dim[-1], self.latent_dim) if add_pre_post_proj else nn.Identity()
70
+ self.latent_post = nn.Linear(self.latent_dim, latent_dim[-1]) if add_pre_post_proj else nn.Identity()
71
+
72
+ self.arch = arch
73
+ self.time_cond_proj_dim = time_cond_proj_dim
74
+
75
+ self.moe_num_experts = moe_num_experts
76
+ self.moe_topk = moe_topk
77
+ self.moe_loss_weight = moe_loss_weight
78
+
79
+ self.time_proj = Timesteps(time_dim, flip_sin_to_cos, freq_shift)
80
+ self.time_embedding = TimestepEmbedding(time_dim, self.latent_dim, time_act_fn, post_act_fn=time_post_act_fn,
81
+ cond_proj_dim=time_cond_proj_dim, zero_init_cond=zero_init_cond)
82
+ self.emb_proj = nn.Sequential(get_activation_fn(text_act_fn), nn.Linear(text_dim, self.latent_dim))
83
+
84
+ self.query_pos = build_position_encoding(self.latent_dim, position_embedding=position_embedding)
85
+ if self.arch == "trans_enc":
86
+ if moe:
87
+ encoder_layer = MoeTransformerEncoderLayer(
88
+ self.latent_dim, num_heads, moe_num_experts, moe_topk, ff_size,
89
+ dropout, activation, normalize_before, norm_eps, moe_jitter_noise)
90
+ else:
91
+ encoder_layer = TransformerEncoderLayer(
92
+ self.latent_dim, num_heads, ff_size, dropout,
93
+ activation, normalize_before, norm_eps)
94
+
95
+ encoder_norm = nn.LayerNorm(self.latent_dim, eps=norm_eps) if norm_post and not is_controlnet else None
96
+ self.encoder = SkipTransformerEncoder(encoder_layer, num_layers, encoder_norm, activation_post,
97
+ is_controlnet=is_controlnet, is_moe=moe)
98
+
99
+ elif self.arch == 'trans_dec':
100
+ if add_mem_pos:
101
+ self.mem_pos = build_position_encoding(self.latent_dim, position_embedding=position_embedding)
102
+ else:
103
+ self.mem_pos = None
104
+ if moe:
105
+ decoder_layer = MoeTransformerDecoderLayer(
106
+ self.latent_dim, num_heads, moe_num_experts, moe_topk, ff_size,
107
+ dropout, activation, normalize_before, norm_eps, moe_jitter_noise)
108
+ else:
109
+ decoder_layer = TransformerDecoderLayer(
110
+ self.latent_dim, num_heads, ff_size, dropout,
111
+ activation, normalize_before, norm_eps)
112
+
113
+ decoder_norm = nn.LayerNorm(self.latent_dim, eps=norm_eps) if norm_post and not is_controlnet else None
114
+ self.decoder = SkipTransformerDecoder(decoder_layer, num_layers, decoder_norm, activation_post,
115
+ is_controlnet=is_controlnet, is_moe=moe)
116
+ else:
117
+ raise ValueError(f"Not supported architecture: {self.arch}!")
118
+
119
+ self.is_controlnet = is_controlnet
120
+ if self.is_controlnet:
121
+ embed_dim = controlnet_embed_dim if controlnet_embed_dim is not None else self.latent_dim
122
+ modules = [
123
+ nn.Linear(latent_dim[-1], embed_dim),
124
+ get_activation_fn(controlnet_act_fn) if controlnet_act_fn else None,
125
+ nn.Linear(embed_dim, embed_dim),
126
+ get_activation_fn(controlnet_act_fn) if controlnet_act_fn else None,
127
+ zero_module(nn.Linear(embed_dim, latent_dim[-1]))
128
+ ]
129
+ self.controlnet_cond_embedding = nn.Sequential(*[m for m in modules if m is not None])
130
+
131
+ self.controlnet_down_mid_blocks = nn.ModuleList([
132
+ zero_module(nn.Linear(self.latent_dim, self.latent_dim)) for _ in range(num_layers)])
133
+
134
+ def forward(self,
135
+ sample: torch.Tensor,
136
+ timestep: torch.Tensor,
137
+ encoder_hidden_states: torch.Tensor,
138
+ timestep_cond: Optional[torch.Tensor] = None,
139
+ controlnet_cond: Optional[torch.Tensor] = None,
140
+ controlnet_residuals: Optional[list[torch.Tensor]] = None
141
+ ) -> tuple:
142
+
143
+ # 0. check if controlnet
144
+ if self.is_controlnet:
145
+ sample = sample + self.controlnet_cond_embedding(controlnet_cond)
146
+
147
+ # 1. dimension matching (pre)
148
+ sample = sample.permute(1, 0, 2)
149
+ sample = self.latent_pre(sample)
150
+
151
+ # 2. time_embedding
152
+ timesteps = timestep.expand(sample.shape[1]).clone()
153
+ time_emb = self.time_proj(timesteps)
154
+ time_emb = time_emb.to(dtype=sample.dtype)
155
+ # [1, bs, latent_dim] <= [bs, latent_dim]
156
+ time_emb = self.time_embedding(time_emb, timestep_cond).unsqueeze(0)
157
+
158
+ # 3. condition + time embedding
159
+ # text_emb [seq_len, batch_size, text_dim] <= [batch_size, seq_len, text_dim]
160
+ encoder_hidden_states = encoder_hidden_states.permute(1, 0, 2)
161
+ # text embedding projection
162
+ text_emb_latent = self.emb_proj(encoder_hidden_states)
163
+ emb_latent = torch.cat((time_emb, text_emb_latent), 0)
164
+
165
+ # 4. transformer
166
+ if self.arch == "trans_enc":
167
+ xseq = torch.cat((sample, emb_latent), axis=0)
168
+ xseq = self.query_pos(xseq)
169
+ tokens, intermediates, router_logits = self.encoder(xseq, controlnet_residuals=controlnet_residuals)
170
+ elif self.arch == 'trans_dec':
171
+ sample = self.query_pos(sample)
172
+ if self.mem_pos:
173
+ emb_latent = self.mem_pos(emb_latent)
174
+ tokens, intermediates, router_logits = self.decoder(sample, emb_latent,
175
+ controlnet_residuals=controlnet_residuals)
176
+ else:
177
+ raise TypeError(f"{self.arch} is not supported")
178
+
179
+ router_loss = None
180
+ if router_logits is not None:
181
+ router_loss = load_balancing_loss_func(router_logits, self.moe_num_experts, self.moe_topk)
182
+ router_loss = self.moe_loss_weight * router_loss
183
+
184
+ if self.is_controlnet:
185
+ control_res_samples = []
186
+ for res, block in zip(intermediates, self.controlnet_down_mid_blocks):
187
+ r = block(res)
188
+ control_res_samples.append(r)
189
+ return control_res_samples, router_loss
190
+ elif self.arch == "trans_enc":
191
+ sample = tokens[:sample.shape[0]]
192
+ elif self.arch == 'trans_dec':
193
+ sample = tokens
194
+ else:
195
+ raise TypeError(f"{self.arch} is not supported")
196
+
197
+ # 5. dimension matching (post)
198
+ sample = self.latent_post(sample)
199
+ sample = sample.permute(1, 0, 2)
200
+ return sample, router_loss
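A self-contained check of the MoE auxiliary loss above, assuming the repository modules are importable: with num_experts=4 and topk=2 it sits near its minimum of 2.0 for balanced routing and rises toward 4.0 as routing collapses onto two experts.

import torch
from mld.models.architectures.mld_denoiser import load_balancing_loss_func

torch.manual_seed(0)
balanced = (0.01 * torch.randn(1024, 4),)                          # one layer of router logits
skewed = (torch.tensor([[5.0, 5.0, -5.0, -5.0]]).repeat(1024, 1),)
print(load_balancing_loss_func(balanced, num_experts=4, topk=2).item())  # ~2.0
print(load_balancing_loss_func(skewed, num_experts=4, topk=2).item())    # ~4.0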
mld/models/architectures/mld_traj_encoder.py ADDED
@@ -0,0 +1,64 @@
1
+ from typing import Optional
2
+
3
+ import torch
4
+ import torch.nn as nn
5
+
6
+ from mld.models.operator.attention import SkipTransformerEncoder, TransformerEncoderLayer
7
+ from mld.models.operator.position_encoding import build_position_encoding
8
+
9
+
10
+ class MldTrajEncoder(nn.Module):
11
+
12
+ def __init__(self,
13
+ nfeats: int,
14
+ latent_dim: list = [1, 256],
15
+ hidden_dim: Optional[int] = None,
16
+ force_post_proj: bool = False,
17
+ ff_size: int = 1024,
18
+ num_layers: int = 9,
19
+ num_heads: int = 4,
20
+ dropout: float = 0.1,
21
+ normalize_before: bool = False,
22
+ norm_eps: float = 1e-5,
23
+ activation: str = "gelu",
24
+ norm_post: bool = True,
25
+ activation_post: Optional[str] = None,
26
+ position_embedding: str = "learned") -> None:
27
+ super(MldTrajEncoder, self).__init__()
28
+
29
+ self.latent_size = latent_dim[0]
30
+ self.latent_dim = latent_dim[-1] if hidden_dim is None else hidden_dim
31
+ add_post_proj = force_post_proj or (hidden_dim is not None and hidden_dim != latent_dim[-1])
32
+ self.latent_proj = nn.Linear(self.latent_dim, latent_dim[-1]) if add_post_proj else nn.Identity()
33
+
34
+ self.skel_embedding = nn.Linear(nfeats * 3, self.latent_dim)
35
+
36
+ self.query_pos_encoder = build_position_encoding(
37
+ self.latent_dim, position_embedding=position_embedding)
38
+
39
+ encoder_layer = TransformerEncoderLayer(
40
+ self.latent_dim,
41
+ num_heads,
42
+ ff_size,
43
+ dropout,
44
+ activation,
45
+ normalize_before,
46
+ norm_eps
47
+ )
48
+ encoder_norm = nn.LayerNorm(self.latent_dim, eps=norm_eps) if norm_post else None
49
+ self.encoder = SkipTransformerEncoder(encoder_layer, num_layers, encoder_norm, activation_post)
50
+ self.global_motion_token = nn.Parameter(torch.randn(self.latent_size, self.latent_dim))
51
+
52
+ def forward(self, features: torch.Tensor, mask: torch.Tensor = None) -> torch.Tensor:
53
+ bs, nframes, nfeats = features.shape
54
+ x = self.skel_embedding(features)
55
+ x = x.permute(1, 0, 2)
56
+ dist = torch.tile(self.global_motion_token[:, None, :], (1, bs, 1))
57
+ dist_masks = torch.ones((bs, dist.shape[0]), dtype=torch.bool, device=x.device)
58
+ aug_mask = torch.cat((dist_masks, mask), 1)
59
+ xseq = torch.cat((dist, x), 0)
60
+ xseq = self.query_pos_encoder(xseq)
61
+ global_token = self.encoder(xseq, src_key_padding_mask=~aug_mask)[0][:dist.shape[0]]
62
+ global_token = self.latent_proj(global_token)
63
+ global_token = global_token.permute(1, 0, 2)
64
+ return global_token
mld/models/architectures/mld_vae.py ADDED
@@ -0,0 +1,136 @@
1
+ from typing import Optional
2
+
3
+ import torch
4
+ import torch.nn as nn
5
+ from torch.distributions.distribution import Distribution
6
+
7
+ from mld.models.operator.attention import (
8
+ SkipTransformerEncoder,
9
+ SkipTransformerDecoder,
10
+ TransformerDecoder,
11
+ TransformerDecoderLayer,
12
+ TransformerEncoder,
13
+ TransformerEncoderLayer
14
+ )
15
+ from mld.models.operator.position_encoding import build_position_encoding
16
+
17
+
18
+ class MldVae(nn.Module):
19
+
20
+ def __init__(self,
21
+ nfeats: int,
22
+ latent_dim: list = [1, 256],
23
+ hidden_dim: Optional[int] = None,
24
+ force_pre_post_proj: bool = False,
25
+ ff_size: int = 1024,
26
+ num_layers: int = 9,
27
+ num_heads: int = 4,
28
+ dropout: float = 0.1,
29
+ arch: str = "encoder_decoder",
30
+ normalize_before: bool = False,
31
+ norm_eps: float = 1e-5,
32
+ activation: str = "gelu",
33
+ norm_post: bool = True,
34
+ activation_post: Optional[str] = None,
35
+ position_embedding: str = "learned") -> None:
36
+ super(MldVae, self).__init__()
37
+
38
+ self.latent_size = latent_dim[0]
39
+ self.latent_dim = latent_dim[-1] if hidden_dim is None else hidden_dim
40
+ add_pre_post_proj = force_pre_post_proj or (hidden_dim is not None and hidden_dim != latent_dim[-1])
41
+ self.latent_pre = nn.Linear(self.latent_dim, latent_dim[-1]) if add_pre_post_proj else nn.Identity()
42
+ self.latent_post = nn.Linear(latent_dim[-1], self.latent_dim) if add_pre_post_proj else nn.Identity()
43
+
44
+ self.arch = arch
45
+
46
+ self.query_pos_encoder = build_position_encoding(
47
+ self.latent_dim, position_embedding=position_embedding)
48
+
49
+ encoder_layer = TransformerEncoderLayer(
50
+ self.latent_dim,
51
+ num_heads,
52
+ ff_size,
53
+ dropout,
54
+ activation,
55
+ normalize_before,
56
+ norm_eps
57
+ )
58
+ encoder_norm = nn.LayerNorm(self.latent_dim, eps=norm_eps) if norm_post else None
59
+ self.encoder = SkipTransformerEncoder(encoder_layer, num_layers, encoder_norm, activation_post)
60
+
61
+ if self.arch == "all_encoder":
62
+ decoder_norm = nn.LayerNorm(self.latent_dim, eps=norm_eps) if norm_post else None
63
+ self.decoder = SkipTransformerEncoder(encoder_layer, num_layers, decoder_norm, activation_post)
64
+ elif self.arch == 'encoder_decoder':
65
+ self.query_pos_decoder = build_position_encoding(
66
+ self.latent_dim, position_embedding=position_embedding)
67
+
68
+ decoder_layer = TransformerDecoderLayer(
69
+ self.latent_dim,
70
+ num_heads,
71
+ ff_size,
72
+ dropout,
73
+ activation,
74
+ normalize_before,
75
+ norm_eps
76
+ )
77
+ decoder_norm = nn.LayerNorm(self.latent_dim, eps=norm_eps) if norm_post else None
78
+ self.decoder = SkipTransformerDecoder(decoder_layer, num_layers, decoder_norm, activation_post)
79
+ else:
80
+ raise ValueError(f"Not support architecture: {self.arch}!")
81
+
82
+ self.global_motion_token = nn.Parameter(torch.randn(self.latent_size * 2, self.latent_dim))
83
+ self.skel_embedding = nn.Linear(nfeats, self.latent_dim)
84
+ self.final_layer = nn.Linear(self.latent_dim, nfeats)
85
+
86
+ def forward(self, features: torch.Tensor, mask: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor, Distribution]:
87
+ z, dist = self.encode(features, mask)
88
+ feats_rst = self.decode(z, mask)
89
+ return feats_rst, z, dist
90
+
91
+ def encode(self, features: torch.Tensor, mask: torch.Tensor) -> tuple[torch.Tensor, Distribution]:
92
+ bs, nframes, nfeats = features.shape
93
+ x = self.skel_embedding(features)
94
+ x = x.permute(1, 0, 2)
95
+ dist = torch.tile(self.global_motion_token[:, None, :], (1, bs, 1))
96
+ dist_masks = torch.ones((bs, dist.shape[0]), dtype=torch.bool, device=x.device)
97
+ aug_mask = torch.cat((dist_masks, mask), 1)
98
+ xseq = torch.cat((dist, x), 0)
99
+
100
+ xseq = self.query_pos_encoder(xseq)
101
+ dist = self.encoder(xseq, src_key_padding_mask=~aug_mask)[0][:dist.shape[0]]
102
+ dist = self.latent_pre(dist)
103
+
104
+ mu = dist[0:self.latent_size, ...]
105
+ logvar = dist[self.latent_size:, ...]
106
+
107
+ std = logvar.exp().pow(0.5)
108
+ dist = torch.distributions.Normal(mu, std)
109
+ latent = dist.rsample()
110
+ # [latent_dim[0], batch_size, latent_dim] -> [batch_size, latent_dim[0], latent_dim[1]]
111
+ latent = latent.permute(1, 0, 2)
112
+ return latent, dist
113
+
114
+ def decode(self, z: torch.Tensor, mask: torch.Tensor) -> torch.Tensor:
115
+ # [batch_size, latent_dim[0], latent_dim[1]] -> [latent_dim[0], batch_size, latent_dim[1]]
116
+ z = self.latent_post(z)
117
+ z = z.permute(1, 0, 2)
118
+ bs, nframes = mask.shape
119
+ queries = torch.zeros(nframes, bs, self.latent_dim, device=z.device)
120
+
121
+ if self.arch == "all_encoder":
122
+ xseq = torch.cat((z, queries), axis=0)
123
+ z_mask = torch.ones((bs, self.latent_size), dtype=torch.bool, device=z.device)
124
+ aug_mask = torch.cat((z_mask, mask), axis=1)
125
+ xseq = self.query_pos_decoder(xseq)
126
+ output = self.decoder(xseq, src_key_padding_mask=~aug_mask)[0][z.shape[0]:]
127
+ elif self.arch == "encoder_decoder":
128
+ queries = self.query_pos_decoder(queries)
129
+ output = self.decoder(tgt=queries, memory=z, tgt_key_padding_mask=~mask)[0]
130
+ else:
131
+ raise ValueError(f"Not support architecture: {self.arch}!")
132
+
133
+ output = self.final_layer(output)
134
+ output[~mask.T] = 0
135
+ feats = output.permute(1, 0, 2)
136
+ return feats
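A shape sketch for the VAE above with randomly initialized weights, assuming the repository's operator modules are importable: a (batch, frames, 263) motion is compressed into latent_dim[0] latent tokens and decoded back to per-frame features under the same mask.

import torch
from mld.models.architectures.mld_vae import MldVae

vae = MldVae(nfeats=263, latent_dim=[1, 256]).eval()
feats = torch.randn(2, 60, 263)
mask = torch.ones(2, 60, dtype=torch.bool)
with torch.no_grad():
    recon, z, dist = vae(feats, mask)
assert z.shape == (2, 1, 256) and recon.shape == (2, 60, 263)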