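# DQN hyperparameters, one top-level section per Gym environment ID.
# Notes on the YAML conventions used here (a sketch, not repo documentation):
# - `!!float` is required because YAML 1.1 loaders such as PyYAML parse bare
#   scientific notation like `5e4` as a string rather than a number.
# - `&name` defines an anchor; `<<: *name` merges its keys into a section,
#   with locally defined keys overriding the merged defaults.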
CartPole-v1: &cartpole-defaults
  n_timesteps: !!float 5e4
  env_hyperparams:
    rolling_length: 50
  policy_hyperparams:
    hidden_sizes: [256, 256]
  algo_hyperparams:
    learning_rate: !!float 2.3e-3
    batch_size: 64
    buffer_size: 100000
    learning_starts: 1000
    gamma: 0.99
    target_update_interval: 10
    train_freq: 256
    gradient_steps: 128
    exploration_fraction: 0.16
    exploration_final_eps: 0.04
  eval_params:
    step_freq: !!float 1e4

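# Inherits every CartPole-v1 setting via the merge key, overriding only the
# training budget: v0 caps episodes at 200 steps (vs. 500 for v1), so fewer
# timesteps suffice.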
CartPole-v0:
  <<: *cartpole-defaults
  n_timesteps: !!float 4e4

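# Sparse-reward task: smaller replay buffer and more frequent target updates
# than CartPole. These values appear to match the tuned DQN settings in
# rl-baselines3-zoo for MountainCar-v0.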
MountainCar-v0:
  n_timesteps: !!float 1.2e5
  env_hyperparams:
    rolling_length: 50
  policy_hyperparams:
    hidden_sizes: [256, 256]
  algo_hyperparams:
    learning_rate: !!float 4e-3
    batch_size: 128
    buffer_size: 10000
    learning_starts: 1000
    gamma: 0.98
    target_update_interval: 600
    train_freq: 16
    gradient_steps: 8
    exploration_fraction: 0.2
    exploration_final_eps: 0.07

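# gradient_steps: -1 follows the Stable-Baselines3 convention of running as
# many gradient steps as environment steps collected per rollout (here,
# train_freq = 4). Assumption: this repo mirrors that semantics.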
Acrobot-v1:
  n_timesteps: !!float 1e5
  env_hyperparams:
    rolling_length: 50
  policy_hyperparams:
    hidden_sizes: [256, 256]
  algo_hyperparams:
    learning_rate: !!float 6.3e-4
    batch_size: 128
    buffer_size: 50000
    learning_starts: 0
    gamma: 0.99
    target_update_interval: 250
    train_freq: 4
    gradient_steps: -1
    exploration_fraction: 0.12
    exploration_final_eps: 0.1

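# Longer run with gradient clipping via max_grad_norm. `25_000` below uses
# YAML 1.1 underscore digit grouping and loads as the integer 25000.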
LunarLander-v2:
  n_timesteps: !!float 5e5
  env_hyperparams:
    rolling_length: 50
  policy_hyperparams:
    hidden_sizes: [256, 256]
  algo_hyperparams:
    learning_rate: !!float 1e-4
    batch_size: 256
    buffer_size: 100000
    learning_starts: 10000
    gamma: 0.99
    target_update_interval: 250
    train_freq: 8
    gradient_steps: -1
    exploration_fraction: 0.12
    exploration_final_eps: 0.1
    max_grad_norm: 0.5
  eval_params:
    step_freq: 25_000

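# Shared defaults for Atari (lowercase key: an anchor holder, not an env ID).
# frame_stack: 4 is the standard DeepMind preprocessing; "subproc" presumably
# selects a subprocess-based vectorized env, analogous to SB3's SubprocVecEnv
# (assumption). no_reward_timeout_steps likely aborts episodes after 1,000
# consecutive reward-free steps (repo-specific; assumption). Evaluation is
# non-deterministic, the usual Atari practice to avoid action loops.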
atari: &atari-defaults
  n_timesteps: !!float 1e7
  env_hyperparams:
    frame_stack: 4
    no_reward_timeout_steps: 1_000
    n_envs: 8
    vec_env_class: "subproc"
  algo_hyperparams:
    buffer_size: 100000
    learning_rate: !!float 1e-4
    batch_size: 32
    learning_starts: 100000
    target_update_interval: 1000
    train_freq: 8
    gradient_steps: 2
    exploration_fraction: 0.1
    exploration_final_eps: 0.01
  eval_params:
    deterministic: false

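# Pong learns much faster than most Atari games, so the 1e7-step default is
# cut to 2.5e6; all other settings come from the atari anchor.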
PongNoFrameskip-v4:
  <<: *atari-defaults
  n_timesteps: !!float 2.5e6