Here, we introduce how to implement a simple maze task with SkyAI. The maze task has a discrete state set and a discrete action set, and will be implemented as a SkyAI module. As a reinforcement learning algorithm, Peng's Q(lambda)-learning is applied to the maze task; for this, we use predefined modules.
The following is the procedure:
1. Implement the maze environment and task as a module (MMazeTaskModule).
2. Implement a random action module (MRandomActionModule) to test the maze task.
3. Write a main function and a makefile, and build the executable.
4. Test the maze task with random actions.
5. Apply the Q(lambda)-learning module to the maze task.
The sample code works on a console; no extra libraries are required. Let's start!
The maze has a size of W x H, consisting of the start (S), the goal (G), and walls. The robot cannot go through the walls. Its objective is to move from the start to the goal along the shortest path. This is an example of the maze environment:
 # # # # # # # # # #
 #       #       G #
 #   #   #         #
 # S #   # #       #
 #   #     #   # # #
 #         #       #
 #                 #
 # # # # # # # # # #
The state is a 1-dimensional discrete value into which the (x, y) position is serialized. The action is a discrete value chosen from {up, down, left, right}. A reward of +1 is given when the robot arrives at the goal, and -0.01 is given for each action step.
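For example, with a map of width 10, the position (x, y) = (1, 3) is serialized into the state 3 * 10 + 1 = 31. The following is a minimal sketch of this mapping (the function names here are illustrative; the module's own serialize function appears later):

// Serialize a 2-D grid position into a 1-D state index (illustrative helper).
int serialize (int x, int y, int width)  { return y * width + x; }
// Recover the position from a state index (not needed by the module; shown for clarity).
int deserialize_x (int s, int width)  { return s % width; }
int deserialize_y (int s, int width)  { return s / width; }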
Please refer to ../Tutorial - Making Module.
int    NumEpisodes;     // number of episodes
int    MaxSteps;        // number of max action steps per episode
int    StartX, StartY;  // start position
double GoalReward;      // goal reward
double StepCost;        // cost for each action step
int    SleepUTime;      // duration for display (in microseconds)
std::vector<std::vector<int> >  Map;  // Map[y][x], 0: free space, 1: wall, 2: goal; every row should have the same size
TMazeTaskConfigurations (var_space::TVariableMap &mmap)
  :
    NumEpisodes   (1000),
    MaxSteps      (1000),
    StartX        (1),
    StartY        (1),
    GoalReward    (1.0),
    StepCost      (-0.01),
    SleepUTime    (1000)
  {
    Register(mmap);
  }
ADD( NumEpisodes );
ADD( MaxSteps );
ADD( StartX );
ADD( StartY );
ADD( GoalReward );
ADD( StepCost );
ADD( SleepUTime );
ADD( Map );
In order to use std::vector<std::vector<int> > as a configuration parameter, the following header is needed:

#include <lora/variable_space_impl.h>  // to store std::vector<TIntVector>
//===========================================================================================
//!\brief Maze task (environment+task) module
class MMazeTaskModule
    : public TModuleInterface
//===========================================================================================
{
public:
  typedef TModuleInterface  TParent;
  typedef MMazeTaskModule   TThis;
  SKYAI_MODULE_NAMES(MMazeTaskModule)

  MMazeTaskModule (const std::string &v_instance_name)
    : TParent (v_instance_name),
      conf_   (TParent::param_box_config_map())
    {
    }

protected:
  TMazeTaskConfigurations  conf_;

};  // end of MMazeTaskModule
//-------------------------------------------------------------------------------------------
The module has the following ports, declared in the class definition:

MAKE_SLOT_PORT(slot_start, void, (void), (), TThis);
MAKE_SLOT_PORT(slot_execute_action, void, (const TInt &a), (a), TThis);

MAKE_SIGNAL_PORT(signal_initialization, void (void), TThis);
MAKE_SIGNAL_PORT(signal_start_of_episode, void (void), TThis);
MAKE_SIGNAL_PORT(signal_finish_episode, void (void), TThis);
MAKE_SIGNAL_PORT(signal_end_of_episode, void (void), TThis);
MAKE_SIGNAL_PORT(signal_start_of_step, void (void), TThis);
MAKE_SIGNAL_PORT(signal_end_of_step, void (void), TThis);
MAKE_SIGNAL_PORT(signal_reward, void (const TSingleReward &), TThis);

MAKE_OUT_PORT(out_state_set_size, const TInt&, (void), (), TThis);
MAKE_OUT_PORT(out_action_set_size, const TInt&, (void), (), TThis);
MAKE_OUT_PORT(out_state, const TInt&, (void), (), TThis);
MAKE_OUT_PORT(out_time, const TReal&, (void), (), TThis);
Initialize every port in the constructor's initializer list:

MMazeTaskModule (const std::string &v_instance_name)
  : ...
    slot_start               (*this),
    slot_execute_action      (*this),
    signal_initialization    (*this),
    signal_start_of_episode  (*this),
    signal_finish_episode    (*this),
    signal_end_of_episode    (*this),
    signal_start_of_step     (*this),
    signal_end_of_step       (*this),
    signal_reward            (*this),
    out_state_set_size       (*this),
    out_action_set_size      (*this),
    out_state                (*this),
    out_time                 (*this)
Then, register the ports in the constructor's body:

add_slot_port   (slot_start             );
add_slot_port   (slot_execute_action    );
add_signal_port (signal_initialization  );
add_signal_port (signal_start_of_episode);
add_signal_port (signal_finish_episode  );
add_signal_port (signal_end_of_episode  );
add_signal_port (signal_start_of_step   );
add_signal_port (signal_end_of_step     );
add_signal_port (signal_reward          );
add_out_port    (out_state_set_size     );
add_out_port    (out_action_set_size    );
add_out_port    (out_state              );
add_out_port    (out_time               );
Declare the following member variables in the protected section:

mutable int  state_set_size_;
const int    action_set_size_;
int          current_action_;
int          pos_x_, pos_y_;   // current position of the robot
mutable int  tmp_state_;
TReal        current_time_;    // current step count in the episode
TInt         num_episode_;     // current episode number
and initialize some of them in the constructor's initializer list:

state_set_size_   (0),
action_set_size_  (4),
current_action_   (0),
The callback function of slot_start is declared in the protected section:

virtual void slot_start_exec (void);

Then, define it outside the class:
/*virtual*/ void MMazeTaskModule::slot_start_exec (void)
{
  init_environment();
  signal_initialization.ExecAll();

  for (num_episode_= 0; num_episode_ < conf_.NumEpisodes; ++num_episode_)
  {
    init_environment();
    signal_start_of_episode.ExecAll();

    bool running(true);
    while (running)
    {
      signal_start_of_step.ExecAll();
      running= step_environment();
      show_environment();
      usleep(conf_.SleepUTime);
      if (current_time_ >= conf_.MaxSteps)
      {
        signal_finish_episode.ExecAll();
        running= false;
      }
      signal_end_of_step.ExecAll();
    }
    signal_end_of_episode.ExecAll();
  }
}

Here, we used three member functions. They are declared in the protected section:
void init_environment (void);
bool step_environment (void);
void show_environment (void);

and defined outside the class:
void MMazeTaskModule::init_environment (void)
{
  pos_x_= conf_.StartX;
  pos_y_= conf_.StartY;
  current_time_= 0.0l;
}
bool MMazeTaskModule::step_environment (void)
{
  // compute the next position from the current action:
  int next_x(pos_x_), next_y(pos_y_);
  switch(current_action_)
  {
    case 0: ++next_x; break;  // right
    case 1: --next_y; break;  // up
    case 2: --next_x; break;  // left
    case 3: ++next_y; break;  // down
    default: LERROR("invalid action:"<<current_action_);
  }
  ++current_time_;
  signal_reward.ExecAll(conf_.StepCost);
  switch(conf_.Map[next_y][next_x])
  {
    case 0:  // free space
      pos_x_= next_x;  pos_y_= next_y;
      break;
    case 1:  // wall
      break;
    case 2:  // goal
      pos_x_= next_x;  pos_y_= next_y;
      signal_reward.ExecAll(conf_.GoalReward);
      signal_finish_episode.ExecAll();
      return false;
    default:
      LERROR("invalid map element: "<<conf_.Map[next_y][next_x]);
  }
  return true;
}
void MMazeTaskModule::show_environment (void)
{
  int x(0), y(0);
  std::cout<<"("<<pos_x_<<","<<pos_y_<<") "<<current_time_<<"/"<<num_episode_<<std::endl;
  for (std::vector<std::vector<int> >::const_iterator yitr(conf_.Map.begin()),ylast(conf_.Map.end()); yitr!=ylast; ++yitr,++y)
  {
    x= 0;
    for (std::vector<int>::const_iterator xitr(yitr->begin()),xlast(yitr->end()); xitr!=xlast; ++xitr,++x)
    {
      std::cout<<" ";
      if (x==pos_x_ && y==pos_y_)  std::cout<<"R";
      else if (x==conf_.StartX && y==conf_.StartY)  std::cout<<"S";
      else switch(*xitr)
      {
        case 0: std::cout<<" "; break;
        case 1: std::cout<<"#"; break;
        case 2: std::cout<<"G"; break;
        default: std::cout<<"?"; break;
      }
    }
    std::cout<<" "<<std::endl;
  }
  std::cout<<std::endl;
}
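Note that step_environment indexes conf_.Map[next_y][next_x] without a bounds check, so the map must be surrounded by walls (as in the example maps below); otherwise the access is undefined. If you want to be defensive, a guard like the following could be inserted just before the switch on the map element (an optional addition, not part of the original tutorial):

// Hypothetical safeguard: treat any out-of-range position as a wall.
if (next_y < 0 || next_y >= (int)conf_.Map.size()
    || next_x < 0 || next_x >= (int)conf_.Map[next_y].size())
  return true;  // the robot stays where it is, as when hitting a wall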
The callbacks of slot_execute_action and the out-ports are simple:

virtual void slot_execute_action_exec (const TInt &a)
  {
    current_action_= a;
  }
virtual const TInt& out_state_set_size_get (void) const
  {
    state_set_size_= conf_.Map[0].size() * conf_.Map.size();
    return state_set_size_;
  }
virtual const TInt& out_action_set_size_get (void) const
  {
    return action_set_size_;
  }
virtual const TInt& out_state_get (void) const
  {
    return tmp_state_= serialize(pos_x_,pos_y_);
  }
virtual const TReal& out_time_get (void) const
  {
    return current_time_;
  }

where serialize is a protected member function defined as follows:
int serialize (int x, int y) const
  {
    return y * conf_.Map[0].size() + x;
  }
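The main function shown later calls p_maze_task->Start(), so the class also needs a public method that simply executes slot_start. A minimal sketch, assuming no such helper is defined elsewhere:

public:
  // Entry point for the main function: run the episode loop via slot_start.
  void Start (void)
    {
      slot_start.Exec();
    }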
Finally, use the SKYAI_ADD_MODULE macro to register the module on SkyAI:

SKYAI_ADD_MODULE(MMazeTaskModule)
That's it.
Next, in order to test the MMazeTaskModule module, we make a module named MRandomActionModule that emits a random action at each step. MRandomActionModule has two ports:

- slot_step : a slot port that is called at each step,
- signal_action : a signal port that emits a randomly chosen action.
Thus, its implementation is very simple:
//===========================================================================================
//!\brief Random action module
class MRandomActionModule
    : public TModuleInterface
//===========================================================================================
{
public:
  typedef TModuleInterface     TParent;
  typedef MRandomActionModule  TThis;
  SKYAI_MODULE_NAMES(MRandomActionModule)

  MRandomActionModule (const std::string &v_instance_name)
    : TParent       (v_instance_name),
      slot_step     (*this),
      signal_action (*this)
    {
      add_slot_port   (slot_step    );
      add_signal_port (signal_action);
    }

protected:
  MAKE_SLOT_PORT(slot_step, void, (void), (), TThis);
  MAKE_SIGNAL_PORT(signal_action, void (const TInt &), TThis);

  virtual void slot_step_exec (void)
    {
      signal_action.ExecAll(rand() % 4);
    }

};  // end of MRandomActionModule
//-------------------------------------------------------------------------------------------
Then, use SKYAI_ADD_MODULE macro to register the module on SkyAI:
SKYAI_ADD_MODULE(MRandomActionModule)
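Note that rand() is used without seeding, so every run of the random test produces the same action sequence. If you prefer varied runs, you could seed the generator once, e.g. at the beginning of the main function (an optional tweak, not part of the original code):

#include <cstdlib>   // rand, srand
#include <ctime>     // time
...
srand(time(NULL));   // seed the C random number generator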
Refer to ../Tutorial - Making Executable.
Our main function is as follows:
using namespace std;
using namespace loco_rabbits;

int main (int argc, char **argv)
{
  TOptionParser option(argc,argv);

  TAgent  agent;
  if (!ParseCmdLineOption (agent, option))  return 0;

  MMazeTaskModule *p_maze_task= dynamic_cast<MMazeTaskModule*>(agent.SearchModule("maze_task"));
  if (p_maze_task==NULL)
    {LERROR("module `maze_task' is not defined as an instance of MMazeTaskModule"); return 1;}

  agent.SaveToFile (agent.GetDataFileName("before.agent"),"before-");

  p_maze_task->Start();

  agent.SaveToFile (agent.GetDataFileName("after.agent"),"after-");

  return 0;
}
This main function consists of the following parts:
1. Parse the command line options and load the agent script (ParseCmdLineOption).
2. Search for the module instance named maze_task and check that it is an MMazeTaskModule.
3. Save the agent into a file before learning.
4. Start the maze task, i.e. the episode loop.
5. Save the agent into a file after learning.
First, write a makefile as follows:
BASE_REL_DIR:=../..
include $(BASE_REL_DIR)/Makefile_preconf

EXEC := maze.out
OBJS := maze.o

USING_SKYAI_ODE:=true
MAKING_SKYAI:=true

include $(BASE_REL_DIR)/Makefile_body
Then, execute the make command:
make
An executable named maze.out is generated.
Now, let's test MMazeTaskModule using MRandomActionModule.
Create an agent script named random_act.agent. First, instantiate the modules:

module MMazeTaskModule maze_task
module MRandomActionModule rand_action
Then, connect the ports:

connect maze_task.signal_start_of_step , rand_action.slot_step
connect rand_action.signal_action , maze_task.slot_execute_action

With these connections, at the start of every step the maze task calls rand_action.slot_step, and the emitted random action is passed back to maze_task.slot_execute_action.
Finally, configure the maze task:

maze_task.config={
    Map={
        []= (1,1,1,1,1,1,1,1,1,1)
        []= (1,0,0,0,1,0,0,0,2,1)
        []= (1,0,1,0,1,0,0,0,0,1)
        []= (1,0,1,0,1,1,0,0,0,1)
        []= (1,0,1,0,0,1,0,1,1,1)
        []= (1,0,0,0,0,1,0,0,0,1)
        []= (1,0,0,0,0,0,0,0,0,1)
        []= (1,1,1,1,1,1,1,1,1,1)
      }
    StartX= 1
    StartY= 3
  }

Each []= line appends a row to Map, where 0 denotes a free space, 1 a wall, and 2 the goal.
That's it. Let's test!
Launch the executable as follows:
./maze.out -agent random_act
You will see a maze as follows, where the robot (R) moves randomly:
(1,5) 77/4
 # # # # # # # # # #
 #       #       G #
 #   #   #         #
 # S #   # #       #
 #   #     #   # # #
 # R       #       #
 #                 #
 # # # # # # # # # #
Once you have confirmed that MMazeTaskModule works correctly, let's apply a Q-learning module.
Create an agent script named ql.agent. First, include the predefined Q-learning modules:

include_once "ql_dsda"
module MMazeTaskModule maze_task
module MTDDiscStateAct behavior
Then, connect the ports:

/// initialization process:
connect maze_task.signal_initialization   , behavior.slot_initialize
/// start of episode process:
connect maze_task.signal_start_of_episode , behavior.slot_start_episode
/// learning signals:
connect behavior.signal_execute_action    , maze_task.slot_execute_action
connect maze_task.signal_end_of_step      , behavior.slot_finish_action
connect maze_task.signal_reward           , behavior.slot_add_to_reward
connect maze_task.signal_finish_episode   , behavior.slot_finish_episode_immediately
/// I/O:
connect maze_task.out_action_set_size     , behavior.in_action_set_size
connect maze_task.out_state_set_size      , behavior.in_state_set_size
connect maze_task.out_state               , behavior.in_state
connect maze_task.out_time                , behavior.in_cont_time
The maze task is configured in the same way as in the random action test:

maze_task.config={
    Map={
        []= (1,1,1,1,1,1,1,1,1,1)
        []= (1,0,0,0,1,0,0,0,2,1)
        []= (1,0,1,0,1,0,0,0,0,1)
        []= (1,0,1,0,1,1,0,0,0,1)
        []= (1,0,1,0,0,1,0,1,1,1)
        []= (1,0,0,0,0,1,0,0,0,1)
        []= (1,0,0,0,0,0,0,0,0,1)
        []= (1,1,1,1,1,1,1,1,1,1)
      }
    StartX= 1
    StartY= 3
  }
The behavior module is configured as follows:

behavior.config={
    UsingEligibilityTrace = true
    UsingReplacingTrace = true
    Lambda = 0.9
    GradientMax = 1.0e+100
    ActionSelection = "asBoltzman"
    PolicyImprovement = "piExpReduction"
    Tau = 1
    TauDecreasingFactor = 0.05
    TraceMax = 1.0
    Gamma = 0.9
    Alpha = 0.3
    AlphaDecreasingFactor = 0.002
    AlphaMin = 0.05
  }
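For reference, Gamma is the discount factor, Alpha the learning rate, Lambda the eligibility-trace decay, and Tau the temperature of the Boltzmann action selection. The core of what behavior learns is a tabular Q-value update; the following is a rough sketch of the one-step backup (an illustration of the general algorithm, not SkyAI's actual implementation, which uses Peng's Q(lambda) with eligibility traces):

#include <vector>
#include <algorithm>

// One-step tabular Q-learning backup (illustrative). Peng's Q(lambda)
// additionally propagates this TD error along an eligibility trace that
// is decayed by Gamma*Lambda at every step.
void q_update (std::vector<std::vector<double> > &Q,
               int s, int a, double reward, int s_next,
               double alpha, double gamma)
{
  double max_q= *std::max_element(Q[s_next].begin(), Q[s_next].end());
  Q[s][a]+= alpha * (reward + gamma * max_q - Q[s][a]);
}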
Launch the executable as follows:
./maze.out -path ../../benchmarks/cmn -agent ql -outdir result/rl1
where ../../benchmarks/cmn is the relative path to the benchmarks/cmn directory; modify it for your environment.
After several tens of episodes, the policy will converge to a path:
(1,4) 1/520
 # # # # # # # # # #
 #       #       G #
 #   #   #         #
 # S #   # #       #
 # R #     #   # # #
 #         #       #
 #                 #
 # # # # # # # # # #

(3,6) 5/520
 # # # # # # # # # #
 #       #       G #
 #   #   #         #
 # S #   # #       #
 #   #     #   # # #
 #         #       #
 #     R           #
 # # # # # # # # # #

(6,6) 8/520
 # # # # # # # # # #
 #       #       G #
 #   #   #         #
 # S #   # #       #
 #   #     #   # # #
 #         #       #
 #           R     #
 # # # # # # # # # #

(7,3) 12/520
 # # # # # # # # # #
 #       #       G #
 #   #   #         #
 # S #   # #   R   #
 #   #     #   # # #
 #         #       #
 #                 #
 # # # # # # # # # #

(8,1) 15/520
 # # # # # # # # # #
 #       #       R #
 #   #   #         #
 # S #   # #       #
 #   #     #   # # #
 #         #       #
 #                 #
 # # # # # # # # # #
In order to store the learning logs, make a directory result/rl1, which is specified with the -outdir option. Plotting log-eps-ret.dat, you will obtain a learning curve (the return obtained in each episode).
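For example, assuming that each line of log-eps-ret.dat stores an episode index followed by the return of that episode (the exact column layout is an assumption here), the curve can be plotted with gnuplot:

plot 'result/rl1/log-eps-ret.dat' using 1:2 with lines title 'return per episode'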