|
|
|
|
|
#ifndef DLIB_LSPI_Hh_ |
|
#define DLIB_LSPI_Hh_ |
|
|
|
#include "lspi_abstract.h" |
|
#include "approximate_linear_models.h" |
|
|
|
namespace dlib |
|
{ |
|
|
|
|
|
|
|
template < |
|
typename feature_extractor |
|
> |
|
class lspi |
|
{ |
|
public: |
|
typedef feature_extractor feature_extractor_type; |
|
typedef typename feature_extractor::state_type state_type; |
|
typedef typename feature_extractor::action_type action_type; |
|
|
|
explicit lspi( |
|
const feature_extractor& fe_ |
|
) : fe(fe_) |
|
{ |
|
init(); |
|
} |
|
|
|
lspi( |
|
) |
|
{ |
|
init(); |
|
} |
|
|
|
double get_discount ( |
|
) const { return discount; } |
|
|
|
void set_discount ( |
|
double value |
|
) |
|
{ |
|
|
|
DLIB_ASSERT(0 < value && value <= 1, |
|
"\t void lspi::set_discount(value)" |
|
<< "\n\t invalid inputs were given to this function" |
|
<< "\n\t value: " << value |
|
); |
|
discount = value; |
|
} |
|
|
|
const feature_extractor& get_feature_extractor ( |
|
) const { return fe; } |
|
|
|
void be_verbose ( |
|
) |
|
{ |
|
verbose = true; |
|
} |
|
|
|
void be_quiet ( |
|
) |
|
{ |
|
verbose = false; |
|
} |
|
|
|
void set_epsilon ( |
|
double eps_ |
|
) |
|
{ |
|
|
|
DLIB_ASSERT(eps_ > 0, |
|
"\t void lspi::set_epsilon(eps_)" |
|
<< "\n\t invalid inputs were given to this function" |
|
<< "\n\t eps_: " << eps_ |
|
); |
|
eps = eps_; |
|
} |
|
|
|
double get_epsilon ( |
|
) const |
|
{ |
|
return eps; |
|
} |
|
|
|
void set_lambda ( |
|
double lambda_ |
|
) |
|
{ |
|
|
|
DLIB_ASSERT(lambda_ >= 0, |
|
"\t void lspi::set_lambda(lambda_)" |
|
<< "\n\t invalid inputs were given to this function" |
|
<< "\n\t lambda_: " << lambda_ |
|
); |
|
lambda = lambda_; |
|
} |
|
|
|
double get_lambda ( |
|
) const |
|
{ |
|
return lambda; |
|
} |
|
|
|
void set_max_iterations ( |
|
unsigned long max_iter |
|
) { max_iterations = max_iter; } |
|
|
|
unsigned long get_max_iterations ( |
|
) { return max_iterations; } |
|
|
|
template <typename vector_type> |
|
policy<feature_extractor> train ( |
|
const vector_type& samples |
|
) const |
|
{ |
|
|
|
DLIB_ASSERT(samples.size() > 0, |
|
"\t policy lspi::train(samples)" |
|
<< "\n\t invalid inputs were given to this function" |
|
); |
|
|
|
matrix<double,0,1> w(fe.num_features()); |
|
w = 0; |
|
matrix<double,0,1> prev_w, b, f1, f2; |
|
|
|
matrix<double> A; |
|
|
|
double change; |
|
unsigned long iter = 0; |
|
do |
|
{ |
|
A = identity_matrix<double>(fe.num_features())*lambda; |
|
b = 0; |
|
for (unsigned long i = 0; i < samples.size(); ++i) |
|
{ |
|
fe.get_features(samples[i].state, samples[i].action, f1); |
|
fe.get_features(samples[i].next_state, |
|
fe.find_best_action(samples[i].next_state,w), |
|
f2); |
|
A += f1*trans(f1 - discount*f2); |
|
b += f1*samples[i].reward; |
|
} |
|
|
|
prev_w = w; |
|
if (feature_extractor::force_last_weight_to_1) |
|
w = join_cols(pinv(colm(A,range(0,A.nc()-2)))*(b-colm(A,A.nc()-1)),mat(1.0)); |
|
else |
|
w = pinv(A)*b; |
|
|
|
change = length(w-prev_w); |
|
++iter; |
|
|
|
if (verbose) |
|
std::cout << "iteration: " << iter << "\tchange: " << change << std::endl; |
|
|
|
} while(change > eps && iter < max_iterations); |
|
|
|
return policy<feature_extractor>(w,fe); |
|
} |
|
|
|
|
|
private: |
|
|
|
void init() |
|
{ |
|
lambda = 0.01; |
|
discount = 0.8; |
|
eps = 0.01; |
|
verbose = false; |
|
max_iterations = 100; |
|
} |
|
|
|
double lambda; |
|
double discount; |
|
double eps; |
|
bool verbose; |
|
unsigned long max_iterations; |
|
feature_extractor fe; |
|
}; |
|
|
|
|
|
|
|
} |
|
|
|
#endif |
|
|
|
|