File size: 2,070 Bytes
f392320
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
# Top-level seed value. Its consumer is not visible in this file; presumably it
# seeds the random number generators for reproducible runs — confirm against
# the training script.
random_seed: 1

# Experiment-logging settings (the section name suggests Weights & Biases —
# confirm against the training script).
WANDB:
  # project the run is filed under
  project: 'StructDiffusion'
  # interpolated from `base_dirs.wandb_dir`, which is not defined in this file
  save_dir: '${base_dirs.wandb_dir}'
  # name given to this run
  name: 'conditional_pose_diffusion_language_shuffle'

# Dataset settings. All paths are interpolated from `base_dirs.data`, which is
# not defined in this file. MODEL.ignore_rgb and
# MODEL.use_virtual_structure_frame interpolate their values from this section.
DATASET:
  data_root: ${base_dirs.data}
  vocab_dir: ${base_dirs.data}/type_vocabs_coarse.json

  # important
  use_virtual_structure_frame: true
  ignore_distractor_objects: true
  ignore_rgb: true

  # the following are determined by the dataset
  max_num_target_objects: 7
  max_num_distractor_objects: 5
  # set to 1 because a sentence embedding is used, which takes only one slot
  # in the input sequence of the diffusion transformer
  max_num_shape_parameters: 1
  # set to zero because they are not used for now
  max_num_rearrange_features: 0
  max_num_anchor_features: 0

  # language
  sentence_embedding_file: ${base_dirs.data}/template_sentence_data.pkl
  use_incomplete_sentence: true

  # shuffle
  shuffle_object_index: true

  num_pts: 1024
  # written as an explicit null: a bare `key:` parses to the same value but
  # reads like a forgotten entry. Presumably null means "no filtering" —
  # confirm against the dataset loader.
  filter_num_moved_objects_range: null
  data_augmentation: false

# Batch-loading settings. The keys are presumably forwarded to the data loader
# constructor — confirm against the training script.
DATALOADER:
  # samples per batch
  batch_size: 64
  # number of loader workers
  num_workers: 8
  pin_memory: true

# Model hyperparameters. Values shared with the dataset are interpolated from
# the DATASET section so the two sections cannot drift apart.
MODEL:
  # transformer encoder
  encoder_input_dim: 256
  num_attention_heads: 8
  encoder_hidden_dim: 512
  encoder_dropout: 0.0
  encoder_activation: relu
  encoder_num_layers: 8
  # output head
  structure_dropout: 0
  object_dropout: 0
  # pc encoder
  ignore_rgb: ${DATASET.ignore_rgb}
  pc_emb_dim: 256
  posed_pc_emb_dim: 80
  # pose encoder
  pose_emb_dim: 80
  # language
  word_emb_dim: 160
  # diffusion step
  time_emb_dim: 80
  # sequence embeddings
  # max_seq_size follows DATASET.max_num_target_objects (resolves to 7 here).
  # If DATASET.ignore_distractor_objects is turned off, this must also include
  # DATASET.max_num_distractor_objects.
  max_seq_size: ${DATASET.max_num_target_objects}
  max_token_type_size: 4
  seq_pos_emb_dim: 8
  seq_type_emb_dim: 8
  # virtual frame
  use_virtual_structure_frame: ${DATASET.use_virtual_structure_frame}
  # sentence embedding
  use_sentence_embedding: true
  sentence_embedding_dim: 384

# Noise-schedule settings. `timesteps` is presumably the number of diffusion
# steps — confirm against the model code.
NOISE_SCHEDULE:
  timesteps: 200

# Training-loss selection. The set of accepted `type` values is defined by the
# consumer and is not visible in this file.
LOSS:
  type: huber

# Optimizer hyperparameters.
OPTIMIZER:
  lr: 0.0001
  weight_decay: 0  # 0.0001 (alternative value kept for reference)
  # disabled options, kept for reference:
  # lr_restart: 3000
  # warmup: 10

# Training-loop settings. The keys are presumably passed to the trainer
# constructor — confirm against the training script.
TRAINER:
  # upper bound on training epochs
  max_epochs: 200
  gradient_clip_val: 1.0
  gpus: 1
  deterministic: false
  # disabled option, kept for reference:
  # enable_progress_bar: False