Spaces:

skar0
/

cartpole-demo

Runtime error

App Files Files Community

skar0 committed on Mar 4, 2023

Commit

820bb68

1 Parent(s): 52c188e

Initial commit

Browse files

Files changed (5) hide show

.gitignore +136 -0
README.md +3 -12
app.py +54 -0
cartpole.py +635 -0
requirements.txt +438 -0

.gitignore ADDED Viewed

	@@ -0,0 +1,136 @@

+generate/
+videos/
+token.txt
+pat.txt
+*.ipynb
+runs/
+wandb/
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+# C extensions
+*.so
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+pip-wheel-metadata/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+# Translations
+*.mo
+*.pot
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+# Flask stuff:
+instance/
+.webassets-cache
+# Scrapy stuff:
+.scrapy
+# Sphinx documentation
+docs/_build/
+# PyBuilder
+target/
+# Jupyter Notebook
+.ipynb_checkpoints
+# IPython
+profile_default/
+ipython_config.py
+# pyenv
+.python-version
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+#Pipfile.lock
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow
+__pypackages__/
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+# SageMath parsed files
+*.sage.py
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+# Spyder project settings
+.spyderproject
+.spyproject
+# Rope project settings
+.ropeproject
+# mkdocs documentation
+/site
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+# Pyre type checker
+.pyre/

README.md CHANGED Viewed

@@ -1,13 +1,4 @@
----
-title: Cartpole Demo
-emoji: 🔥
-colorFrom: blue
-colorTo: yellow
-sdk: gradio
-sdk_version: 3.19.1
-app_file: app.py
-pinned: false
-license: wtfpl
----
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

+# cartpole-demo
+This project publishes my solution to the CartPole environment from OpenAI's Gym.
+I want to deploy it to Hugging Face.

app.py ADDED Viewed

	@@ -0,0 +1,54 @@

+import glob
+import gradio as gr
+import gym
+import sys
+from torch.utils.tensorboard import SummaryWriter
+import yaml
+import torch
+from cartpole import (
+    make_env, reset_env, Agent, rollout_phase, get_action_shape
+)
+# True only when this file is executed as a script rather than imported.
+MAIN = __name__ == "__main__"
+# Sample inputs offered in the Gradio interface (passed as `examples=` below).
+examples = [0, 1, 31415, 'Hello, World!', 'This is a seed...']
+def generate_video(
+    string: str, wandb_path='wandb/run-20230303_211416-ox4d1p0u/files'
+):
+    with open(f'{wandb_path}/config.yaml') as f_cfg:
+        config = yaml.safe_load(f_cfg)
+    seed = hash(string)  % ((sys.maxsize + 1) * 2)
+    num_envs = config['num_envs']['value']
+    num_steps = config['num_steps']['value']
+    assert seed >= 0
+    assert isinstance(seed, int)
+    run_name = f'seed{seed}'
+    log_dir = f'generate/{run_name}'
+    writer = SummaryWriter(log_dir)
+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+    envs = gym.vector.SyncVectorEnv([
+        make_env("CartPole-v1", seed, i, True, run_name)
+        for i in range(num_envs)
+    ])
+    action_shape = get_action_shape(envs)
+    next_obs, next_done = reset_env(envs, device)
+    global_step = 0
+    agent = Agent(envs).to(device)
+    agent.load_state_dict(torch.load(f'{wandb_path}/model_state_dict.pt'))
+    rollout_phase(
+        next_obs, next_done, agent, envs, writer, device,
+        global_step, action_shape, num_envs, num_steps,
+    )
+    video_path = glob.glob(f'videos/{run_name}/*.mp4')[0]
+    return video_path
+# Build and launch the Gradio UI only when run as a script.
+if MAIN:
+    # One text box in, one video out: the text is passed to generate_video
+    # as its `string` argument.
+    demo = gr.Interface(
+        fn=generate_video,
+        inputs=[
+            gr.components.Textbox(lines=1, label="Seed"),
+        ],
+        outputs=gr.components.Video(label="Generated Video"),
+        examples=examples,
+    )
+    demo.launch(share=True)

cartpole.py ADDED Viewed

	@@ -0,0 +1,635 @@

+# ---
+# jupyter:
+#   jupytext:
+#     text_representation:
+#       extension: .py
+#       format_name: light
+#       format_version: '1.5'
+#       jupytext_version: 1.14.5
+#   kernelspec:
+#     display_name: Python 3
+#     name: python3
+# ---
+# + id="QAY_RQOLcRtA" executionInfo={"status": "ok", "timestamp": 1677942285188, "user_tz": 0, "elapsed": 1942, "user": {"displayName": "Oskar Hollinsworth", "userId": "00307706571197304608"}} colab={"base_uri": "https://localhost:8080/"} outputId="ee4de327-947e-4f4e-9d34-514460da288a"
+MAIN = __name__ == "__main__"
+if MAIN:
+    print('Mounting drive...')
+    from google.colab import drive
+    drive.mount('/content/drive')
+# %cd /content/drive/MyDrive/Colab Notebooks/cartpole-demo
+# + colab={"base_uri": "https://localhost:8080/"} id="GgSNZRJh4EjV" executionInfo={"status": "ok", "timestamp": 1677942324397, "user_tz": 0, "elapsed": 39212, "user": {"displayName": "Oskar Hollinsworth", "userId": "00307706571197304608"}} outputId="8fd1eecc-12d1-4bae-cd15-dd541f1d84c7"
+# !pip install einops
+# !pip install wandb
+# !pip install jupytext
+# !pip install pygame
+# !pip install torchtyping
+# !pip install gradio
+# + colab={"base_uri": "https://localhost:8080/"} id="1g58HZUb8Ltl" executionInfo={"status": "ok", "timestamp": 1677942492332, "user_tz": 0, "elapsed": 2440, "user": {"displayName": "Oskar Hollinsworth", "userId": "00307706571197304608"}} outputId="d2f2ab57-c2c0-49aa-fdef-323556a2e4b6"
+# !git config --global user.email "[email protected]"
+# !git config --global user.name "ojh31"
+# !cat pat.txt | xargs git remote set-url origin
+# !jupytext --to py cartpole.ipynb
+# !git fetch
+# !git status
+# + id="vEczQ48wC40O" executionInfo={"status": "ok", "timestamp": 1677942330521, "user_tz": 0, "elapsed": 4062, "user": {"displayName": "Oskar Hollinsworth", "userId": "00307706571197304608"}}
+import os
+import glob
+import sys
+import argparse
+import random
+import time
+from distutils.util import strtobool
+from dataclasses import dataclass
+from typing import Optional
+import numpy as np
+import torch
+import torch as t
+from torchtyping import TensorType as TT
+from typeguard import typechecked
+import gym
+import torch.nn as nn
+import torch.optim as optim
+from torch.distributions.categorical import Categorical
+from torch.utils.tensorboard import SummaryWriter
+from gym.spaces import Discrete
+from typing import Any, List, Optional, Union, Tuple, Iterable
+from einops import rearrange
+import importlib
+import wandb
+from typeguard import typechecked
+# + id="K7T8bs1Y76ZK" executionInfo={"status": "ok", "timestamp": 1677942330521, "user_tz": 0, "elapsed": 8, "user": {"displayName": "Oskar Hollinsworth", "userId": "00307706571197304608"}} colab={"base_uri": "https://localhost:8080/"} outputId="f59ffef0-7156-4f27-d992-a392d59a1c73"
+# %env "WANDB_NOTEBOOK_NAME" "cartpole.py"
+# + id="Q5E93-BGRjuy" executionInfo={"status": "ok", "timestamp": 1677942330522, "user_tz": 0, "elapsed": 8, "user": {"displayName": "Oskar Hollinsworth", "userId": "00307706571197304608"}}
+def make_env(
+    env_id: str, seed: int, idx: int, capture_video: bool, run_name: str
+):
+    """
+    Return a function that returns an environment after setting up boilerplate.
+    """
+    def thunk():
+        env = gym.make(env_id, new_step_api=True)
+        env = gym.wrappers.RecordEpisodeStatistics(env)
+        if capture_video:
+            if idx == 0:
+                # Video every 50 runs for env #1
+                env = gym.wrappers.RecordVideo(
+                    env,
+                    f"videos/{run_name}",
+                    episode_trigger=lambda x : x % 50 == 0
+                )
+        obs = env.reset(seed=seed)
+        env.action_space.seed(seed)
+        env.observation_space.seed(seed)
+        return env
+    return thunk
+# + id="Kf152ROwHjM_" executionInfo={"status": "ok", "timestamp": 1677942330522, "user_tz": 0, "elapsed": 7, "user": {"displayName": "Oskar Hollinsworth", "userId": "00307706571197304608"}}
+def test_minibatch_indexes(minibatch_indexes):
+    for n in range(5):
+        frac, minibatch_size = np.random.randint(1, 8, size=(2,))
+        batch_size = frac * minibatch_size
+        indices = minibatch_indexes(batch_size, minibatch_size)
+        assert any([isinstance(indices, list), isinstance(indices, np.ndarray)])
+        assert isinstance(indices[0], np.ndarray)
+        assert len(indices) == frac
+        np.testing.assert_equal(np.sort(np.stack(indices).flatten()), np.arange(batch_size))
+# + id="mhvduVeOHkln" executionInfo={"status": "ok", "timestamp": 1677942330522, "user_tz": 0, "elapsed": 7, "user": {"displayName": "Oskar Hollinsworth", "userId": "00307706571197304608"}}
+def test_calc_entropy_bonus(calc_entropy_bonus):
+    probs = Categorical(logits=t.randn((3, 4)))
+    ent_coef = 0.5
+    expected = ent_coef * probs.entropy().mean()
+    actual = calc_entropy_bonus(probs, ent_coef)
+    t.testing.assert_close(expected, actual)
+# + id="Aya60GeCGA5X" executionInfo={"status": "ok", "timestamp": 1677942330875, "user_tz": 0, "elapsed": 360, "user": {"displayName": "Oskar Hollinsworth", "userId": "00307706571197304608"}}
+def layer_init(layer, std=np.sqrt(2), bias_const=0.0):
+    t.nn.init.orthogonal_(layer.weight, std)
+    t.nn.init.constant_(layer.bias, bias_const)
+    return layer
+class Agent(nn.Module):
+    critic: nn.Sequential
+    actor: nn.Sequential
+    def __init__(self, envs: gym.vector.SyncVectorEnv):
+        super().__init__()
+        obs_shape = np.array(
+            (envs.num_envs, ) + envs.single_action_space.shape
+        ).prod().astype(int)
+        self.actor = nn.Sequential(
+            layer_init(nn.Linear(obs_shape, 64)),
+            nn.Tanh(),
+            layer_init(nn.Linear(64, 64)),
+            nn.Tanh(),
+            layer_init(nn.Linear(64, envs.single_action_space.n), std=.01),
+        )
+        self.critic = nn.Sequential(
+            layer_init(nn.Linear(obs_shape, 64)),
+            nn.Tanh(),
+            layer_init(nn.Linear(64, 64)),
+            nn.Tanh(),
+            layer_init(nn.Linear(64, 1), std=1),
+        )
+# + id="6PwPZHlLGDYu" executionInfo={"status": "ok", "timestamp": 1677942330875, "user_tz": 0, "elapsed": 4, "user": {"displayName": "Oskar Hollinsworth", "userId": "00307706571197304608"}}
+# %%
+@t.inference_mode()
+def compute_advantages(
+    next_value: t.Tensor,
+    next_done: t.Tensor,
+    rewards: t.Tensor,
+    values: t.Tensor,
+    dones: t.Tensor,
+    device: t.device,
+    gamma: float,
+    gae_lambda: float,
+) -> t.Tensor:
+    '''Compute advantages using Generalized Advantage Estimation.
+    next_value: shape (1, env) -
+        represents V(s_{t+1}) which is needed for the last advantage term
+    next_done: shape (env,)
+    rewards: shape (t, env)
+    values: shape (t, env)
+    dones: shape (t, env)
+    Return: shape (t, env)
+    '''
+    assert isinstance(next_value, t.Tensor)
+    assert isinstance(next_done, t.Tensor)
+    assert isinstance(rewards, t.Tensor)
+    assert isinstance(values, t.Tensor)
+    assert isinstance(dones, t.Tensor)
+    t_max, n_env = values.shape
+    next_values = t.concat((values[1:, ], next_value))
+    next_dones = t.concat((dones[1:, ], next_done.unsqueeze(0)))
+    deltas = rewards + gamma * next_values * (1.0 - next_dones) - values
+    adv = deltas.clone().to(device)
+    for to_go in range(1, t_max):
+        t_idx = t_max - to_go - 1
+        t.testing.assert_close(adv[t_idx], deltas[t_idx])
+        adv[t_idx] += (
+            gamma * gae_lambda * adv[t_idx + 1] * (1.0 - next_dones[t_idx])
+        )
+    return adv
+# + id="uYSSMnF-GPvm" executionInfo={"status": "ok", "timestamp": 1677942330875, "user_tz": 0, "elapsed": 3, "user": {"displayName": "Oskar Hollinsworth", "userId": "00307706571197304608"}}
+# %%
+@dataclass
+class Minibatch:
+    # One shuffled slice of the flattened rollout, as built by
+    # make_minibatches. Every field is indexed by the same minibatch indices.
+    obs: t.Tensor  # observations
+    logprobs: t.Tensor  # log-probabilities recorded during the rollout
+    actions: t.Tensor  # actions taken during the rollout
+    advantages: t.Tensor  # advantage estimates
+    returns: t.Tensor  # advantages + values (see make_minibatches)
+    values: t.Tensor  # critic values recorded during the rollout
+def minibatch_indexes(
+    batch_size: int, minibatch_size: int
+) -> List[np.ndarray]:
+    '''
+    Return a list of length (batch_size // minibatch_size) where
+    each element is an array of indexes into the batch.
+    Each index should appear exactly once.
+    '''
+    assert batch_size % minibatch_size == 0
+    n = batch_size // minibatch_size
+    indices = np.arange(batch_size)
+    np.random.shuffle(indices)
+    return [indices[i::n] for i in range(n)]
+# Self-check of minibatch_indexes, run only when executed as a script.
+if MAIN:
+    test_minibatch_indexes(minibatch_indexes)
+def make_minibatches(
+    obs: t.Tensor,
+    logprobs: t.Tensor,
+    actions: t.Tensor,
+    advantages: t.Tensor,
+    values: t.Tensor,
+    obs_shape: tuple,
+    action_shape: tuple,
+    batch_size: int,
+    minibatch_size: int,
+) -> List[Minibatch]:
+    '''
+    Flatten the environment and steps dimension into one batch dimension,
+    then shuffle and split into minibatches.
+    '''
+    n_steps, n_env = values.shape
+    n_dim = n_steps * n_env
+    indexes = minibatch_indexes(batch_size=batch_size, minibatch_size=minibatch_size)
+    obs_flat = obs.reshape((batch_size,) + obs_shape)
+    act_flat = actions.reshape((batch_size,) + action_shape)
+    probs_flat = logprobs.reshape((batch_size,) + action_shape)
+    adv_flat = advantages.reshape(n_dim)
+    val_flat = values.reshape(n_dim)
+    return [
+        Minibatch(
+            obs_flat[idx], probs_flat[idx], act_flat[idx], adv_flat[idx],
+            adv_flat[idx] + val_flat[idx], val_flat[idx]
+        )
+        for idx in indexes
+    ]
+# + id="K7wXDJ9MGOWu" executionInfo={"status": "ok", "timestamp": 1677942330876, "user_tz": 0, "elapsed": 4, "user": {"displayName": "Oskar Hollinsworth", "userId": "00307706571197304608"}}
+# %%
+def calc_policy_loss(
+    probs: Categorical, mb_action: t.Tensor, mb_advantages: t.Tensor,
+    mb_logprobs: t.Tensor, clip_coef: float
+) -> t.Tensor:
+    '''
+    Return the policy loss, suitable for maximisation with gradient ascent.
+    probs:
+        a distribution containing the actor's unnormalized logits of
+        shape (minibatch, num_actions)
+    clip_coef: amount of clipping, denoted by epsilon in Eq 7.
+    normalize: if true, normalize mb_advantages to have mean 0, variance 1
+    '''
+    adv_norm = (mb_advantages - mb_advantages.mean()) / mb_advantages.std()
+    ratio = t.exp(probs.log_prob(mb_action)) / t.exp(mb_logprobs)
+    min_left = ratio * adv_norm
+    min_right = t.clip(ratio, 1 - clip_coef, 1 + clip_coef) * adv_norm
+    return t.minimum(min_left, min_right).mean()
+# + id="CmyxU6JWGMsG" executionInfo={"status": "ok", "timestamp": 1677942330876, "user_tz": 0, "elapsed": 4, "user": {"displayName": "Oskar Hollinsworth", "userId": "00307706571197304608"}}
+# %%
+def calc_value_function_loss(
+    critic: nn.Sequential, mb_obs: t.Tensor, mb_returns: t.Tensor, v_coef: float
+) -> t.Tensor:
+    '''Compute the value function portion of the loss function.
+    Need to minimise this
+    v_coef:
+        the coefficient for the value loss, which weights its contribution to
+        the overall loss. Denoted by c_1 in the paper.
+    '''
+    output = critic(mb_obs)
+    return v_coef * (output - mb_returns).pow(2).mean() / 2
+# + id="npyWs6xjGLkP" executionInfo={"status": "ok", "timestamp": 1677942331469, "user_tz": 0, "elapsed": 597, "user": {"displayName": "Oskar Hollinsworth", "userId": "00307706571197304608"}}
+# %%
+def calc_entropy_loss(probs: Categorical, ent_coef: float):
+    '''Return the entropy loss term.
+    Need to maximise this
+    ent_coef:
+        The coefficient for the entropy loss, which weights its contribution to the overall loss.
+        Denoted by c_2 in the paper.
+    '''
+    return probs.entropy().mean() * ent_coef
+# Self-check of calc_entropy_loss, run only when executed as a script.
+if MAIN:
+    test_calc_entropy_bonus(calc_entropy_loss)
+# + id="nqJeg1kZGKSG" executionInfo={"status": "ok", "timestamp": 1677942331470, "user_tz": 0, "elapsed": 5, "user": {"displayName": "Oskar Hollinsworth", "userId": "00307706571197304608"}}
+# %%
+class PPOScheduler:
+    def __init__(self, optimizer: optim.Adam, initial_lr: float, end_lr: float, num_updates: int):
+        self.optimizer = optimizer
+        self.initial_lr = initial_lr
+        self.end_lr = end_lr
+        self.num_updates = num_updates
+        self.n_step_calls = 0
+    def step(self):
+        '''
+        Implement linear learning rate decay so that after num_updates calls to step,
+        the learning rate is end_lr.
+        '''
+        lr = (
+            self.initial_lr +
+            (self.end_lr - self.initial_lr) * self.n_step_calls / self.num_updates
+        )
+        for param in self.optimizer.param_groups:
+            param['lr'] = lr
+        self.n_step_calls += 1
+def make_optimizer(
+    agent: Agent, num_updates: int, initial_lr: float, end_lr: float
+) -> Tuple[optim.Adam, PPOScheduler]:
+    '''Return an appropriately configured Adam with its attached scheduler.'''
+    optimizer = optim.Adam(agent.parameters(), lr=initial_lr, maximize=True)
+    scheduler = PPOScheduler(
+        optimizer=optimizer, initial_lr=initial_lr, end_lr=end_lr, num_updates=num_updates
+    )
+    return optimizer, scheduler
+# + id="mgZ7-wsRCxJW" executionInfo={"status": "ok", "timestamp": 1677942331470, "user_tz": 0, "elapsed": 5, "user": {"displayName": "Oskar Hollinsworth", "userId": "00307706571197304608"}}
+@dataclass
+class PPOArgs:
+    # Hyperparameters and run settings consumed by train_ppo.
+    exp_name: str = 'cartpole.py'  # part of the run name
+    seed: int = 1  # base seed; env i is seeded with seed + i
+    torch_deterministic: bool = True  # sets torch.backends.cudnn.deterministic
+    cuda: bool = True  # use CUDA when it is available
+    track: bool = True  # log to wandb and save the model at the end
+    wandb_project_name: str = "PPOCart"
+    wandb_entity: Optional[str] = None
+    capture_video: bool = True  # record videos for the first env
+    env_id: str = "CartPole-v1"
+    total_timesteps: int = 40_000  # num_updates = total_timesteps // batch_size
+    learning_rate: float = 0.00025  # initial rate; decays linearly to 0.0
+    num_envs: int = 4  # parallel environments
+    num_steps: int = 128  # rollout steps per environment per update
+    gamma: float = 0.99  # discount factor passed to compute_advantages
+    gae_lambda: float = 0.95  # GAE lambda passed to compute_advantages
+    num_minibatches: int = 4  # NOTE(review): not read by train_ppo in this file
+    update_epochs: int = 4  # passes over each rollout
+    clip_coef: float = 0.2  # clipping range of the policy ratio
+    ent_coef: float = 0.01  # weight of the entropy term
+    vf_coef: float = 0.5  # weight of the value loss
+    max_grad_norm: float = 0.5  # gradient-norm clipping threshold
+    # make_minibatches reshapes the (num_steps, num_envs) rollout to
+    # batch_size rows, so batch_size has to equal num_steps * num_envs.
+    batch_size: int = 512
+    minibatch_size: int = 128  # must divide batch_size
+# + id="xeIu-J3ZwGyq" executionInfo={"status": "ok", "timestamp": 1677942356492, "user_tz": 0, "elapsed": 218, "user": {"displayName": "Oskar Hollinsworth", "userId": "00307706571197304608"}}
+def wandb_init(name: str, args: PPOArgs):
+    wandb.init(
+        project=args.wandb_project_name,
+        entity=args.wandb_entity,
+        sync_tensorboard=True,
+        config=vars(args),
+        name=name,
+        monitor_gym=True,
+        save_code=True,
+        settings=wandb.Settings(symlink=False)
+    )
+# + id="gMYWqhsryYHy" executionInfo={"status": "ok", "timestamp": 1677942331470, "user_tz": 0, "elapsed": 4, "user": {"displayName": "Oskar Hollinsworth", "userId": "00307706571197304608"}}
+def set_seed(seed: int):
+    random.seed(seed)
+    np.random.seed(seed)
+    torch.manual_seed(seed)
+# + id="T9j_L0Wpyrgz" executionInfo={"status": "ok", "timestamp": 1677942331471, "user_tz": 0, "elapsed": 5, "user": {"displayName": "Oskar Hollinsworth", "userId": "00307706571197304608"}}
+@typechecked
+def rollout_phase(
+    next_obs: t.Tensor, next_done: t.Tensor,
+    agent: Agent, envs: gym.vector.SyncVectorEnv,
+    writer: SummaryWriter, device: torch.device,
+    global_step: int,  action_shape: Tuple,
+    num_envs: int, num_steps: int,
+) -> Tuple[
+    TT['envs'],
+    TT['envs'],
+    TT['steps', 'envs'],
+    TT['steps', 'envs'],
+    TT['steps', 'envs'],
+    TT['steps', 'envs'],
+    TT['steps', 'envs'],
+    TT['steps', 'envs'],
+]:
+    '''
+    Output:
+    next_obs, next_done, actions, dones, logprobs, obs, rewards, values
+    '''
+    obs = torch.zeros(
+        (num_steps, num_envs) +
+        envs.single_observation_space.shape
+    ).to(device)
+    actions = torch.zeros(
+        (num_steps, num_envs) +
+        action_shape
+    ).to(device)
+    logprobs = torch.zeros((num_steps, num_envs)).to(device)
+    rewards = torch.zeros((num_steps, num_envs)).to(device)
+    dones = torch.zeros((num_steps, num_envs)).to(device)
+    values = torch.zeros((num_steps, num_envs)).to(device)
+    for i in range(0, num_steps):
+        # Rollout phase
+        global_step += 1
+        curr_obs = next_obs
+        done = next_done
+        with t.inference_mode():
+            logits = agent.actor(curr_obs).detach()
+            q_values = agent.critic(curr_obs).detach().squeeze(-1)
+        prob = Categorical(logits=logits)
+        action = prob.sample()
+        logprob = prob.log_prob(action)
+        next_obs, reward, next_done, info = envs.step(action.numpy())
+        next_obs = t.tensor(next_obs, device=device)
+        next_done = t.tensor(next_done, device=device)
+        actions[i] = action
+        dones[i] = done.detach().clone()
+        logprobs[i] = logprob
+        obs[i] = curr_obs
+        rewards[i] = t.tensor(reward, device=device)
+        values[i] = q_values
+        if writer is not None and "episode" in info.keys():
+            for item in info['episode']:
+                if item is None or 'r' not in item.keys():
+                    continue
+                writer.add_scalar(
+                    "charts/episodic_return", item["r"], global_step
+                )
+                writer.add_scalar(
+                    "charts/episodic_length", item["l"], global_step
+                )
+                if global_step % 10 != 0:
+                    continue
+                print(
+                    f"global_step={global_step}, episodic_return={item['r']}"
+                )
+                print("charts/episodic_return", item["r"], global_step)
+                print("charts/episodic_length", item["l"], global_step)
+    return (
+        next_obs, next_done, actions, dones, logprobs, obs, rewards, values
+    )
+# + id="xdDhABIk5jyb" executionInfo={"status": "ok", "timestamp": 1677942331471, "user_tz": 0, "elapsed": 5, "user": {"displayName": "Oskar Hollinsworth", "userId": "00307706571197304608"}}
+def reset_env(envs, device):
+    next_obs = torch.Tensor(envs.reset()).to(device)
+    next_done = torch.zeros(envs.num_envs).to(device)
+    return next_obs, next_done
+# + id="5CoMpUVU7rFT" executionInfo={"status": "ok", "timestamp": 1677942331471, "user_tz": 0, "elapsed": 5, "user": {"displayName": "Oskar Hollinsworth", "userId": "00307706571197304608"}}
+def get_action_shape(envs: gym.vector.SyncVectorEnv):
+    action_shape = envs.single_action_space.shape
+    assert action_shape is not None
+    assert isinstance(
+        envs.single_action_space, Discrete
+    ), "only discrete action space is supported"
+    return action_shape
+# + id="FHmn5kSUGFFu" executionInfo={"status": "ok", "timestamp": 1677942366007, "user_tz": 0, "elapsed": 251, "user": {"displayName": "Oskar Hollinsworth", "userId": "00307706571197304608"}}
+# %%
+def train_ppo(args: PPOArgs):
+    t0 = int(time.time())
+    run_name = f"{args.env_id}__{args.exp_name}__{args.seed}__{t0}"
+    if args.track:
+        wandb_init(run_name, args)
+    log_dir = wandb.run.dir
+    writer = SummaryWriter(log_dir)
+    writer.add_text(
+        "hyperparameters",
+        "|param|value|\n|-|-|\n%s" % "\n".join([f"|{key}|{value}|"
+        for (key, value) in vars(args).items()]),
+    )
+    set_seed(args.seed)
+    torch.backends.cudnn.deterministic = args.torch_deterministic
+    device = torch.device(
+        "cuda" if torch.cuda.is_available() and args.cuda else "cpu"
+    )
+    envs = gym.vector.SyncVectorEnv([
+        make_env(args.env_id, args.seed + i, i, args.capture_video, run_name)
+        for i in range(args.num_envs)
+    ])
+    agent = Agent(envs).to(device)
+    num_updates = args.total_timesteps // args.batch_size
+    (optimizer, scheduler) = make_optimizer(
+        agent, num_updates, args.learning_rate, 0.0
+    )
+    global_step = 0
+    old_approx_kl = 0.0
+    approx_kl = 0.0
+    value_loss = t.tensor(0.0)
+    policy_loss = t.tensor(0.0)
+    entropy_loss = t.tensor(0.0)
+    clipfracs = []
+    info = []
+    action_shape = get_action_shape(envs)
+    next_obs, next_done = reset_env(envs, device)
+    start_time = time.time()
+    for _ in range(num_updates):
+        rp = rollout_phase(
+            next_obs, next_done, agent, envs, writer, device, global_step,
+            action_shape, args.num_envs, args.num_steps,
+        )
+        next_obs, next_done, actions, dones, logprobs, obs, rewards, values = rp
+        with t.inference_mode():
+            next_value = rearrange(agent.critic(next_obs), "env 1 -> 1 env")
+        advantages = compute_advantages(
+            next_value, next_done, rewards, values, dones, device, args.gamma,
+            args.gae_lambda
+        )
+        clipfracs.clear()
+        mb: Minibatch
+        for _ in range(args.update_epochs):
+            minibatches = make_minibatches(
+                obs,
+                logprobs,
+                actions,
+                advantages,
+                values,
+                envs.single_observation_space.shape,
+                action_shape,
+                args.batch_size,
+                args.minibatch_size,
+            )
+            for mb in minibatches:
+                probs = Categorical(logits=agent.actor(mb.obs))
+                value_loss = calc_value_function_loss(
+                    agent.critic, mb.obs, mb.returns, args.vf_coef
+                )
+                policy_loss = calc_policy_loss(
+                    probs, mb.actions, mb.advantages, mb.logprobs,
+                    args.clip_coef
+                )
+                entropy_loss = calc_entropy_loss(probs, args.ent_coef)
+                loss = policy_loss + entropy_loss - value_loss
+                loss.backward()
+                nn.utils.clip_grad_norm_(agent.parameters(), args.max_grad_norm)
+                optimizer.step()
+                optimizer.zero_grad()
+        scheduler.step()
+        (y_pred, y_true) = (mb.values.cpu().numpy(), mb.returns.cpu().numpy())
+        var_y = np.var(y_true)
+        explained_var = (
+            np.nan if var_y == 0 else 1 - np.var(y_true - y_pred) / var_y
+        )
+        with torch.no_grad():
+            newlogprob: t.Tensor = probs.log_prob(mb.actions)
+            logratio = newlogprob - mb.logprobs
+            ratio = logratio.exp()
+            old_approx_kl = (-logratio).mean().item()
+            approx_kl = (ratio - 1 - logratio).mean().item()
+            clipfracs += [
+                ((ratio - 1.0).abs() > args.clip_coef).float().mean().item()
+            ]
+        writer.add_scalar(
+            "charts/learning_rate", optimizer.param_groups[0]["lr"],
+            global_step
+        )
+        writer.add_scalar("losses/value_loss", value_loss.item(), global_step)
+        writer.add_scalar("losses/policy_loss", policy_loss.item(), global_step)
+        writer.add_scalar("losses/entropy", entropy_loss.item(), global_step)
+        writer.add_scalar("losses/old_approx_kl", old_approx_kl, global_step)
+        writer.add_scalar("losses/approx_kl", approx_kl, global_step)
+        writer.add_scalar("losses/clipfrac", np.mean(clipfracs), global_step)
+        writer.add_scalar(
+            "losses/explained_variance", explained_var, global_step
+        )
+        writer.add_scalar(
+            "charts/SPS",
+            int(global_step / (time.time() - start_time)),
+            global_step
+        )
+        if global_step % 1000 == 0:
+            print(
+                "steps per second (SPS):",
+                int(global_step / (time.time() - start_time))
+            )
+            print("losses/value_loss", value_loss.item())
+            print("losses/policy_loss", policy_loss.item())
+            print("losses/entropy", entropy_loss.item())
+    print(f'... training complete after {global_step} steps')
+    envs.close()
+    writer.close()
+    if args.track:
+        model_path = f'{wandb.run.dir}/model_state_dict.pt'
+        print(f'Saving model to {model_path}')
+        t.save(agent.state_dict(), model_path)
+        wandb.finish()
+        print('...wandb finished.')
+# + id="-oZHTffJZP17" executionInfo={"status": "ok", "timestamp": 1677942433344, "user_tz": 0, "elapsed": 66678, "user": {"displayName": "Oskar Hollinsworth", "userId": "00307706571197304608"}} colab={"base_uri": "https://localhost:8080/", "height": 1000, "referenced_widgets": ["c966d31ee30d43e0a8cc269a8a22b717", "294a378e56c44e4c9a3c58e8bf5b5f62", "473cc94ea22746f3a51e2186d973f741", "e3bb8c5a2c3841c2b33a7b8afb66a88f", "6133d8cbba964b7e8755e1c0691caf27", "1bf18f5fae9c4f58b2e360bc35251a94", "e820d38826494e248ca8974cccc1f338", "05eebe964b4b4c93b4aa0eac9ff865cb"]} outputId="0cfbb11c-831a-4622-8c01-afebae209d04"
+# #%%wandb
+# Train with the default hyperparameters when executed as a script.
+if MAIN:
+    args = PPOArgs()
+    train_ppo(args)
+# + colab={"base_uri": "https://localhost:8080/"} id="xJW6KL7QIj4s" outputId="7c529849-6d46-4a6a-def5-e1c0ef652c64"
+# !python demo.py
+# + id="P7ZfUlAqImIr" executionInfo={"status": "aborted", "timestamp": 1677942332655, "user_tz": 0, "elapsed": 4, "user": {"displayName": "Oskar Hollinsworth", "userId": "00307706571197304608"}}

requirements.txt ADDED Viewed

	@@ -0,0 +1,438 @@

+absl-py==1.4.0
+aeppl==0.0.33
+aesara==2.7.9
+aiofiles==23.1.0
+aiohttp==3.8.4
+aiosignal==1.3.1
+alabaster==0.7.13
+albumentations==1.2.1
+altair==4.2.2
+anyio==3.6.2
+appdirs==1.4.4
+argon2-cffi==21.3.0
+argon2-cffi-bindings==21.2.0
+arviz==0.12.1
+astor==0.8.1
+astropy==4.3.1
+astunparse==1.6.3
+async-timeout==4.0.2
+atomicwrites==1.4.1
+attrs==22.2.0
+audioread==3.0.0
+autograd==1.5
+Babel==2.12.1
+backcall==0.2.0
+backports.zoneinfo==0.2.1
+beautifulsoup4==4.6.3
+bleach==6.0.0
+blis==0.7.9
+bokeh==2.4.3
+branca==0.6.0
+bs4==0.0.1
+CacheControl==0.12.11
+cachetools==5.3.0
+catalogue==2.0.8
+certifi==2022.12.7
+cffi==1.15.1
+cftime==1.6.2
+chardet==4.0.0
+charset-normalizer==3.0.1
+click==8.1.3
+clikit==0.6.2
+cloudpickle==2.2.1
+cmake==3.22.6
+cmdstanpy==1.1.0
+colorcet==3.0.1
+colorlover==0.3.0
+community==1.0.0b1
+confection==0.0.4
+cons==0.4.5
+contextlib2==0.5.5
+convertdate==2.4.0
+crashtest==0.3.1
+crcmod==1.7
+cufflinks==0.17.3
+cvxopt==1.3.0
+cvxpy==1.2.3
+cycler==0.11.0
+cymem==2.0.7
+Cython==0.29.33
+dask==2022.2.1
+datascience==0.17.6
+db-dtypes==1.0.5
+dbus-python==1.2.16
+debugpy==1.6.4
+decorator==4.4.2
+defusedxml==0.7.1
+distributed==2022.2.1
+dlib==19.24.0
+dm-tree==0.1.8
+dnspython==2.3.0
+docker-pycreds==0.4.0
+docutils==0.16
+dopamine-rl==1.0.5
+earthengine-api==0.1.342
+easydict==1.10
+ecos==2.0.12
+editdistance==0.5.3
+einops==0.6.0
+en-core-web-sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.4.1/en_core_web_sm-3.4.1-py3-none-any.whl
+entrypoints==0.4
+ephem==4.1.4
+et-xmlfile==1.1.0
+etils==1.0.0
+etuples==0.3.8
+fa2==0.3.5
+fastai==2.7.11
+fastapi==0.92.0
+fastcore==1.5.28
+fastdownload==0.0.7
+fastdtw==0.3.4
+fastjsonschema==2.16.3
+fastprogress==1.0.3
+fastrlock==0.8.1
+feather-format==0.4.1
+ffmpy==0.3.0
+filelock==3.9.0
+firebase-admin==5.3.0
+fix-yahoo-finance==0.0.22
+Flask==2.2.3
+flatbuffers==23.1.21
+folium==0.12.1.post1
+fonttools==4.38.0
+frozenlist==1.3.3
+fsspec==2023.1.0
+future==0.16.0
+gast==0.4.0
+GDAL==3.3.2
+gdown==4.4.0
+gensim==3.6.0
+geographiclib==1.52
+geopy==1.17.0
+gin-config==0.5.0
+gitdb==4.0.10
+GitPython==3.1.31
+glob2==0.7
+google==2.0.3
+google-api-core==2.11.0
+google-api-python-client==2.70.0
+google-auth==2.16.1
+google-auth-httplib2==0.1.0
+google-auth-oauthlib==0.4.6
+google-cloud-bigquery==3.4.2
+google-cloud-bigquery-storage==2.18.1
+google-cloud-core==2.3.2
+google-cloud-datastore==2.11.1
+google-cloud-firestore==2.7.3
+google-cloud-language==2.6.1
+google-cloud-storage==2.7.0
+google-cloud-translate==3.8.4
+google-colab @ file:///colabtools/dist/google-colab-1.0.0.tar.gz
+google-crc32c==1.5.0
+google-pasta==0.2.0
+google-resumable-media==2.4.1
+googleapis-common-protos==1.58.0
+googledrivedownloader==0.4
+gradio==3.20.0
+graphviz==0.10.1
+greenlet==2.0.2
+grpcio==1.51.3
+grpcio-status==1.48.2
+gspread==3.4.2
+gspread-dataframe==3.0.8
+gym==0.25.2
+gym-notices==0.0.8
+h11==0.14.0
+h5py==3.1.0
+HeapDict==1.0.1
+hijri-converter==2.2.4
+holidays==0.20
+holoviews==1.14.9
+html5lib==1.0.1
+httpcore==0.16.3
+httpimport==0.5.18
+httplib2==0.17.4
+httpstan==4.6.1
+httpx==0.23.3
+humanize==0.5.1
+hyperopt==0.1.2
+idna==2.10
+imageio==2.9.0
+imagesize==1.4.1
+imbalanced-learn==0.8.1
+imblearn==0.0
+imgaug==0.4.0
+importlib-metadata==6.0.0
+importlib-resources==5.12.0
+imutils==0.5.4
+inflect==2.1.0
+intel-openmp==2023.0.0
+ipykernel==5.3.4
+ipython==7.9.0
+ipython-genutils==0.2.0
+ipython-sql==0.3.9
+ipywidgets==7.7.1
+itsdangerous==2.1.2
+jax==0.4.4
+jaxlib @ https://storage.googleapis.com/jax-releases/cuda11/jaxlib-0.4.4+cuda11.cudnn82-cp38-cp38-manylinux2014_x86_64.whl
+jieba==0.42.1
+Jinja2==3.1.2
+joblib==1.2.0
+jsonschema==4.3.3
+jupyter-client==6.1.12
+jupyter-console==6.1.0
+jupyter_core==5.2.0
+jupyterlab-pygments==0.2.2
+jupyterlab-widgets==3.0.5
+jupytext==1.14.5
+kaggle==1.5.12
+keras==2.11.0
+keras-vis==0.4.1
+kiwisolver==1.4.4
+korean-lunar-calendar==0.3.1
+langcodes==3.3.0
+libclang==15.0.6.1
+librosa==0.8.1
+lightgbm==2.2.3
+linkify-it-py==2.0.0
+llvmlite==0.39.1
+lmdb==0.99
+locket==1.0.0
+logical-unification==0.4.5
+LunarCalendar==0.0.9
+lxml==4.9.2
+Markdown==3.4.1
+markdown-it-py==2.2.0
+MarkupSafe==2.1.2
+marshmallow==3.19.0
+matplotlib==3.5.3
+matplotlib-venn==0.11.9
+mdit-py-plugins==0.3.3
+mdurl==0.1.2
+miniKanren==1.0.3
+missingno==0.5.2
+mistune==0.8.4
+mizani==0.8.1
+mkl==2019.0
+mlxtend==0.14.0
+more-itertools==9.1.0
+moviepy==0.2.3.5
+mpmath==1.2.1
+msgpack==1.0.4
+multidict==6.0.4
+multipledispatch==0.6.0
+multitasking==0.0.11
+murmurhash==1.0.9
+music21==5.5.0
+natsort==5.5.0
+nbclient==0.7.2
+nbconvert==6.5.4
+nbformat==5.7.3
+netCDF4==1.6.2
+networkx==3.0
+nibabel==3.0.2
+nltk==3.7
+notebook==6.3.0
+numba==0.56.4
+numexpr==2.8.4
+numpy==1.22.4
+oauth2client==4.1.3
+oauthlib==3.2.2
+opencv-contrib-python==4.6.0.66
+opencv-python==4.6.0.66
+opencv-python-headless==4.7.0.72
+openpyxl==3.0.10
+opt-einsum==3.3.0
+orjson==3.8.7
+osqp==0.6.2.post0
+packaging==23.0
+palettable==3.3.0
+pandas==1.3.5
+pandas-datareader==0.9.0
+pandas-gbq==0.17.9
+pandas-profiling==1.4.1
+pandocfilters==1.5.0
+panel==0.14.3
+param==1.12.3
+parso==0.8.3
+partd==1.3.0
+pastel==0.2.1
+pathlib==1.0.1
+pathtools==0.1.2
+pathy==0.10.1
+patsy==0.5.3
+pep517==0.13.0
+pexpect==4.8.0
+pickleshare==0.7.5
+Pillow==8.4.0
+pip-tools==6.6.2
+platformdirs==3.0.0
+plotly==5.5.0
+plotnine==0.10.1
+pluggy==0.7.1
+pooch==1.7.0
+portpicker==1.3.9
+prefetch-generator==1.0.3
+preshed==3.0.8
+prettytable==3.6.0
+progressbar2==3.38.0
+prometheus-client==0.16.0
+promise==2.3
+prompt-toolkit==2.0.10
+prophet==1.1.2
+proto-plus==1.22.2
+protobuf==3.19.6
+psutil==5.4.8
+psycopg2==2.9.5
+ptyprocess==0.7.0
+py==1.11.0
+pyarrow==9.0.0
+pyasn1==0.4.8
+pyasn1-modules==0.2.8
+pycocotools==2.0.6
+pycparser==2.21
+pycryptodome==3.17
+pyct==0.5.0
+pydantic==1.10.5
+pydata-google-auth==1.7.0
+pydot==1.3.0
+pydot-ng==2.0.0
+pydotplus==2.0.2
+PyDrive==1.3.1
+pydub==0.25.1
+pyerfa==2.0.0.1
+pygame==2.2.0
+Pygments==2.6.1
+PyGObject==3.36.0
+pylev==1.4.0
+pymc==4.1.4
+PyMeeus==0.5.12
+pymongo==4.3.3
+pymystem3==0.2.0
+PyOpenGL==3.1.6
+pyparsing==3.0.9
+pyrsistent==0.19.3
+pysimdjson==3.2.0
+PySocks==1.7.1
+pystan==3.3.0
+pytest==3.6.4
+python-apt==2.0.1
+python-dateutil==2.8.2
+python-louvain==0.16
+python-multipart==0.0.6
+python-slugify==8.0.1
+python-utils==3.5.2
+pytz==2022.7.1
+pyviz-comms==2.2.1
+PyWavelets==1.4.1
+PyYAML==6.0
+pyzmq==23.2.1
+qdldl==0.1.5.post3
+qudida==0.0.4
+regex==2022.6.2
+requests==2.25.1
+requests-oauthlib==1.3.1
+requests-unixsocket==0.2.0
+resampy==0.4.2
+rfc3986==1.5.0
+rpy2==3.5.5
+rsa==4.9
+scikit-image==0.19.3
+scikit-learn==1.2.1
+scipy==1.10.1
+screen-resolution-extra==0.0.0
+scs==3.2.2
+seaborn==0.11.2
+Send2Trash==1.8.0
+sentry-sdk==1.16.0
+setproctitle==1.3.2
+shapely==2.0.1
+six==1.15.0
+sklearn-pandas==2.2.0
+smart-open==6.3.0
+smmap==5.0.0
+sniffio==1.3.0
+snowballstemmer==2.2.0
+sortedcontainers==2.4.0
+soundfile==0.12.1
+spacy==3.4.4
+spacy-legacy==3.0.12
+spacy-loggers==1.0.4
+Sphinx==3.5.4
+sphinxcontrib-applehelp==1.0.4
+sphinxcontrib-devhelp==1.0.2
+sphinxcontrib-htmlhelp==2.0.1
+sphinxcontrib-jsmath==1.0.1
+sphinxcontrib-qthelp==1.0.3
+sphinxcontrib-serializinghtml==1.1.5
+SQLAlchemy==1.4.46
+sqlparse==0.4.3
+srsly==2.4.6
+starlette==0.25.0
+statsmodels==0.13.5
+sympy==1.7.1
+tables==3.7.0
+tabulate==0.8.10
+tblib==1.7.0
+tenacity==8.2.2
+tensorboard==2.11.2
+tensorboard-data-server==0.6.1
+tensorboard-plugin-wit==1.8.1
+tensorflow==2.11.0
+tensorflow-datasets==4.8.3
+tensorflow-estimator==2.11.0
+tensorflow-gcs-config==2.11.0
+tensorflow-hub==0.12.0
+tensorflow-io-gcs-filesystem==0.31.0
+tensorflow-metadata==1.12.0
+tensorflow-probability==0.19.0
+termcolor==2.2.0
+terminado==0.13.3
+text-unidecode==1.3
+textblob==0.15.3
+thinc==8.1.7
+threadpoolctl==3.1.0
+tifffile==2023.2.27
+tinycss2==1.2.1
+toml==0.10.2
+tomli==2.0.1
+toolz==0.12.0
+torch @ https://download.pytorch.org/whl/cu116/torch-1.13.1%2Bcu116-cp38-cp38-linux_x86_64.whl
+torchaudio @ https://download.pytorch.org/whl/cu116/torchaudio-0.13.1%2Bcu116-cp38-cp38-linux_x86_64.whl
+torchsummary==1.5.1
+torchtext==0.14.1
+torchtyping==0.1.4
+torchvision @ https://download.pytorch.org/whl/cu116/torchvision-0.14.1%2Bcu116-cp38-cp38-linux_x86_64.whl
+tornado==6.2
+tqdm==4.64.1
+traitlets==5.7.1
+tweepy==3.10.0
+typeguard==2.13.3
+typer==0.7.0
+typing_extensions==4.5.0
+tzlocal==1.5.1
+uc-micro-py==1.0.1
+uritemplate==4.1.1
+urllib3==1.26.14
+uvicorn==0.20.0
+vega-datasets==0.9.0
+wandb==0.13.10
+wasabi==0.10.1
+wcwidth==0.2.6
+webargs==8.2.0
+webencodings==0.5.1
+websockets==10.4
+Werkzeug==2.2.3
+widgetsnbextension==3.6.2
+wordcloud==1.8.2.2
+wrapt==1.15.0
+xarray==2022.12.0
+xarray-einstats==0.5.1
+xgboost==1.7.4
+xkit==0.0.0
+xlrd==1.2.0
+xlwt==1.3.0
+yarl==1.8.2
+yellowbrick==1.5
+zict==2.2.0
+zipp==3.15.0