import numpy as np
from stable_baselines3 import SAC
from stable_baselines3.sac.policies import MlpPolicy
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_checker import check_env
import os
from monitor_wrap import MonitorWrapper
from filter_wrap import FilterWrapper
from distribution_wrap import DistriWrapper
from redux_wrap import ReduxWrapper
from symetry_wrap import SymetryWrapper
from rotate_wrap import RotateWrapper
from sort_wrap import SortWrapper
from team_wrap import TeamWrapper
from reward_wrap import RewardWrapper
from settings import Settings
from swarmenv import SwarmEnv
import param_


def bi_train(blue_model, red_model, blues: int = 1, reds: int = 1,
             blue_dispersion: np.float32 = 1, red_dispersion: np.float32 = 1, total_timesteps: int = 1000):
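    """Train the red model, then the blue model, for one dispersion setting.

    The Settings dispersion factors are scaled in place, each model is
    evaluated over 10 episodes, and the resulting policies are saved under
    the current policy folder and under policies/last.
    Returns the updated (blue_model, red_model) pair.
    """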
    # Create the save directories if needed
save_dir = "policies/" + Settings.policy_folder + f"/b{blues}r{reds}/"
save_last_dir = "policies/last" + f"/b{blues}r{reds}/"
os.makedirs(save_dir, exist_ok=True)
os.makedirs(save_last_dir, exist_ok=True)
    # Scale the dispersion of the initial drone positions
Settings.blue_distance_factor = blue_dispersion * Settings.blue_distance_factor
Settings.red_distance_factor = red_dispersion * Settings.red_distance_factor
Settings.red_theta_noise = red_dispersion * Settings.red_theta_noise
Settings.red_rho_noise = red_dispersion * Settings.red_rho_noise
    # Train the red drones first, then the blue drones
red_model.learn(total_timesteps=total_timesteps)
mean_reward, std_reward = evaluate_policy(red_model, red_model.env, n_eval_episodes=10)
print(f"REDS b{blues}r{reds} disp_b:{10*blue_dispersion:2.0f} disp_r{10*red_dispersion:2.0f}: "
f"mean_reward:{mean_reward:.2f} +/- {std_reward:.2f}")
red_model.save(save_dir + f"reds_b{10 * blue_dispersion:2.0f}r{10 * red_dispersion:2.0f}")
red_model.save(save_last_dir + "reds_last")
blue_model.learn(total_timesteps=total_timesteps)
mean_reward, std_reward = evaluate_policy(blue_model, blue_model.env, n_eval_episodes=10)
print(f"BLUES b{blues}r{reds} disp_b:{10*blue_dispersion:2.0f} disp_r{10*red_dispersion:2.0f}: "
f"mean_reward:{mean_reward:.2f} +/- {std_reward:.2f}")
    blue_model.save(save_dir + f"blues_b{10 * blue_dispersion:2.0f}r{10 * red_dispersion:2.0f}")
blue_model.save(save_last_dir + "blues_last")
return blue_model, red_model


def meta_train(blues: int = 1, reds: int = 1,
               max_dispersion: np.float32 = 3, iteration: int = 10,
               total_timesteps: int = 100):
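    """Build the shared wrapped environment for blues vs reds drones, create one
    SAC model per team, and run bi_train over a grid of red and blue dispersions.
    """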
Settings.blues, Settings.reds = blues, reds
    # Build the wrapped environment (episode length in simulation steps)
steps = int(param_.DURATION / param_.STEP)
env = SortWrapper(
SymetryWrapper(
RotateWrapper(
ReduxWrapper(
DistriWrapper(
FilterWrapper(
MonitorWrapper(
SwarmEnv(blues=blues, reds=reds), steps, verbose=False)))))))
blue_env = RewardWrapper(TeamWrapper(env, is_blue=True), is_blue=True)
red_env = RewardWrapper(TeamWrapper(env, is_blue=False), is_blue=False)
blue_model = SAC(MlpPolicy, blue_env, verbose=0)
red_model = SAC(MlpPolicy, red_env, verbose=0)
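    # Curriculum: the red dispersion grows while the blue dispersion shrinks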
for red_dispersion in np.linspace(0.1, max_dispersion, num=iteration):
for blue_dispersion in np.linspace(max_dispersion, 0.3, num=iteration):
blue_model, red_model = bi_train(
blue_model, red_model, blues=blues, reds=reds,
blue_dispersion=blue_dispersion, red_dispersion=red_dispersion,
total_timesteps=total_timesteps)


def super_meta_train(max_blues: int = 3, max_reds: int = 3, max_dispersion: np.float32 = 3,
                     iteration: int = 10, total_timesteps: int = 100, policy_folder: str = "default"):
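    """Run meta_train for every (blues, reds) combination up to max_blues and
    max_reds, ordered by the total number of drones.
    """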
Settings.policy_folder = policy_folder
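    # Enumerate every (blues, reds) split for each total number of drones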
for drones_nb in range(2, max_blues + max_reds + 1):
for blues in range(1, max_blues + 1):
reds = drones_nb - blues
if 1 <= reds <= max_reds:
print(f"reds :{reds}, blues: {blues}")
meta_train(blues=blues, reds=reds,
max_dispersion=max_dispersion, iteration=iteration, total_timesteps=total_timesteps)


def print_spaces(env, name: str):
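    """Print the action and observation spaces of env, then run the SB3 environment checker."""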
print("++++++++++++")
print(name)
print(env.action_space)
print(env.observation_space)
print("============")
check_env(env, warn=True)
# super_meta_train(max_blues=1, max_reds=1, iteration=5, max_dispersion=1, total_timesteps=50000, policy_folder="0528_14")
# super_meta_train(max_blues=2, max_reds=2, iteration=4, max_dispersion=3, total_timesteps=10, policy_folder="0528_test")
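# To reload a saved policy (assuming the checkpoint exists), e.g.:
# red_model = SAC.load("policies/last/b1r1/reds_last")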


def simple_red_train(max_dispersion: np.float32 = 3,
                     blues: int = 1, reds: int = 1,
                     iteration: int = 25, total_timesteps: int = 100,
                     policy_folder: str = "simple_red"):
Settings.policy_folder = policy_folder
print(f"Simple_red: reds :{reds}, blues: {blues}")
    # Create the save directories if needed
save_dir = "policies/" + Settings.policy_folder + f"/b{blues}r{reds}/"
save_last_dir = "policies/last" + f"/b{blues}r{reds}/"
os.makedirs(save_dir, exist_ok=True)
os.makedirs(save_last_dir, exist_ok=True)
    # Build the wrapped environment (episode length in simulation steps)
steps = int(param_.DURATION / param_.STEP)
Settings.blues, Settings.reds = blues, reds
env = SortWrapper(
SymetryWrapper(
RotateWrapper(
ReduxWrapper(
DistriWrapper(
FilterWrapper(
MonitorWrapper(
SwarmEnv(blues=blues, reds=reds), steps, verbose=False)))))))
red_env = RewardWrapper(TeamWrapper(env, is_blue=False), is_blue=False)
red_model = SAC(MlpPolicy, red_env, verbose=1)
    # Spread the blue drones out by scaling their initial distance factor by 10
Settings.blue_distance_factor = 10 * Settings.blue_distance_factor
this_iteration = 0
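    # Curriculum over the red spawn dispersion, from tight (0.33) up to max_dispersion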
for red_dispersion in np.linspace(0.33, max_dispersion, num=iteration):
Settings.red_distance_factor = red_dispersion
        # Train the red drones in small batches until the reward converges
this_iteration += 1
batch = 1
mean_reward = 0
delta_reward = 0
stability = 0
count = 0
        # Keep training until the mean reward reaches 9, stays stable for 3
        # consecutive evaluations, and at least 30 batches have been run
        while mean_reward < 9 or stability < 3 or count < 30:
count += 1
red_model.learn(total_timesteps=total_timesteps//10)
last_reward = mean_reward
mean_reward, std_reward = evaluate_policy(red_model, red_model.env, n_eval_episodes=100)
delta_reward = mean_reward - last_reward
if -0.1 <= delta_reward <= 0.1:
stability += 1
else:
stability = 0
print(f"REDS b{blues}r{reds} iteration{this_iteration} batch{batch}: "
f"mean_reward:{mean_reward:.2f} +/- {std_reward:.2f}")
            red_model.save(save_dir + f"reds_i{this_iteration}_batch{batch}")
red_model.save(save_last_dir + "reds_last")
batch += 1


if __name__ == "__main__":
    simple_red_train(total_timesteps=50000, policy_folder="simply_red")