#!/bin/bash
# Launches training jobs.
# Modify this file to launch workers with your preferred cloud API.
# The following implementation runs each worker as a subprocess on the local
# machine.
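#
# Example invocation (assuming this file is saved as launch_training.sh; the
# flag values below are illustrative and the --config string format depends
# on your model/environment hparams):
#
#   ./launch_training.sh \
#       --job_name=my_experiment \
#       --config="env=dict(task='reverse'),agent=dict(algorithm='pg')" \
#       --num_workers=4 \
#       --num_ps=1 \
#       --max_npe=100000 \
#       --num_repetitions=2 \
#       --stop_on_success=true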

MODELS_DIR="/tmp/models"

# Get command line options.
OPTS=$(getopt -n "$0" -o "" --long "job_name:,config:,num_workers:,num_ps:,max_npe:,num_repetitions:,stop_on_success:" -- "$@")
if [ $? != 0 ] ; then echo "Failed parsing options." >&2 ; exit 1 ; fi

eval set -- "$OPTS"

JOB_NAME=""           # Name of the process and the logs directory.
CONFIG=""             # Model and environment hparams.
# NUM_WORKERS: Number of workers to launch for this training job. If using
# neural networks, each worker will be 1 replica.
NUM_WORKERS=1
# NUM_PS: Number of parameter servers to launch for this training job. Only set
# this if using neural networks. For 1 worker, no parameter servers are needed.
# For more than 1 worker, at least 1 parameter server is needed to store the
# global model.
NUM_PS=0
# MAX_NPE: Maximum number of programs executed. Training will quit once this
# threshold is reached. If 0, the threshold is infinite.
MAX_NPE=0
NUM_REPETITIONS=1     # How many times to run this experiment.
STOP_ON_SUCCESS=true  # Whether to halt training when a solution is found.

# Parse options into variables.
while true; do
  case "$1" in
    --job_name ) JOB_NAME="$2"; shift; shift ;;
    --config ) CONFIG="$2"; shift; shift ;;
    --num_workers ) NUM_WORKERS="$2"; shift; shift ;;
    --num_ps ) NUM_PS="$2"; shift; shift ;;
    --max_npe ) MAX_NPE="$2"; shift; shift ;;
    --num_repetitions ) NUM_REPETITIONS="$2"; shift; shift ;;
    --stop_on_success ) STOP_ON_SUCCESS="$2"; shift; shift ;;
    -- ) shift; break ;;
    * ) break ;;
  esac
done
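
# LOGDIR is derived from JOB_NAME below, so fail fast if --job_name was
# omitted rather than silently writing logs to the bare MODELS_DIR root.
if [ -z "$JOB_NAME" ]; then
  echo "--job_name is required." >&2
  exit 1
fi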

# Launch jobs.
# TODO: multi-worker RL training

LOGDIR="$MODELS_DIR/$JOB_NAME"
mkdir -p "$LOGDIR"

BIN_DIR="bazel-bin/single_task"
for (( i=0; i<NUM_WORKERS; i++ ))
do
  # Expecting run.par to be built with Bazel beforehand; it is looked up
  # under bazel-bin, so build from the workspace root first.
  "$BIN_DIR/run.par" \
      --alsologtostderr \
      --config="$CONFIG" \
      --logdir="$LOGDIR" \
      --max_npe="$MAX_NPE" \
      --num_repetitions="$NUM_REPETITIONS" \
      --stop_on_success="$STOP_ON_SUCCESS" \
      --task_id="$i" \
      --num_workers="$NUM_WORKERS" \
      --summary_tasks=1 \
      2> "$LOGDIR/task_$i.log" &  # Run as subprocess
  echo "Launched task $i. Logs: $LOGDIR/task_$i.log"
done
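
# Optionally wait here for all worker subprocesses to exit; by default the
# script returns immediately and the workers keep running in the background.
# wait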


# Use "pidof run.par" to find jobs.
# Kill with "pkill run.par"
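# Follow a worker's progress with, e.g.:
#   tail -f /tmp/models/<job_name>/task_0.log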