{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "anticipated-participant",
   "metadata": {},
   "source": [
    "Reference : https://github.com/MeepMoop/tilecoding"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "id": "korean-theme",
   "metadata": {},
   "outputs": [],
   "source": [
    "from tensorboardX import SummaryWriter\n",
    "import datetime\n",
    "from plotting import confidence_plot\n",
    "import numpy as np\n",
    "import gym"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "id": "recognized-empire",
   "metadata": {},
   "outputs": [],
   "source": [
    "class TileCoder:\n",
    "    \n",
    "    def __init__(self,features,num_tiles,width,offset,env):\n",
    "        \n",
    "        self.n = (width**2)*num_tiles\n",
    "        self.num_tiles= num_tiles\n",
    "        self.offset = offset\n",
    "        self.width = width\n",
    "        self.num_features = len(features)\n",
    "        self.num_actions = env.action_space.n\n",
    "        self.__max_size = self.n\n",
    "        self.tiles = self.generate_tiles(features,width)\n",
    "        \n",
    "    @property\n",
    "    def max_size(self):\n",
    "        return self.__max_size\n",
    "    \n",
    "    def discretize(self,feature):\n",
    "\n",
    "        return np.linspace(feature[0],feature[1],self.width+1)[1:-1]\n",
    "\n",
    "    \n",
    "    def generate_tiles(self,features,width):\n",
    "        '''\n",
    "        only called at initialization\n",
    "        return [tile_1:[feature_1_tile,feature_2_tile]...]\n",
    "        '''\n",
    "        tiles = []\n",
    "        \n",
    "        discretiezed_features = np.array(list(map(lambda x:self.discretize(x),features))) # 2*(width-1)\n",
    "        tiles = np.array(list(map(lambda x:discretiezed_features+x.reshape(2,1),self.offset))) #NUM_TIELS*2*(width-1)\n",
    " \n",
    "            \n",
    "        return tiles\n",
    "    \n",
    "    \n",
    "    def decode(self,state): #->List[int]:\n",
    "        '''\n",
    "        input : feature from gym\n",
    "        return List:[[tile_1_feature_1,tile_1_feature_2],...]\n",
    "        '''\n",
    "\n",
    "        decoded_features = np.array([np.digitize(s,t) for tile in self.tiles for s,t in zip(state,tile)]).reshape(self.num_tiles,2)\n",
    "        \n",
    "        return decoded_features\n",
    "    \n",
    "    def getFeatures(self,state,action):#->List[int]:\n",
    "        '''\n",
    "        return [decoed_feature_idx]\n",
    "        '''\n",
    "        decoded_features= self.decode(state)\n",
    "        \n",
    "        result = [tile_coord[0]+tile_coord[1]*self.width+(self.width**2)*idx+self.n*action for idx,tile_coord in enumerate(decoded_features)]\n",
    "        one_hot_vector = np.zeros((self.n*self.num_actions,1))\n",
    "        one_hot_vector[result]=1\n",
    "        return one_hot_vector"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "id": "domestic-course",
   "metadata": {},
   "outputs": [],
   "source": [
    "class Q_Learning:\n",
    "    \n",
    "    def __init__(self,state_size,action_size,tileCoder,writer,config):\n",
    "        \n",
    "        self.w = np.random.uniform(-1,1,size=(state_size*action_size,1))\n",
    "        self.num_actions = action_size\n",
    "        \n",
    "        self.gamma = config['gamma']\n",
    "        self.learning_rate = config['learning_rate']\n",
    "        \n",
    "        self.target_policy = self.greedy_policy\n",
    "        self.behavior_policy = self.epsilon_greedy_policy\n",
    "        self.tileCoder = tileCoder\n",
    "\n",
    "    def greedy_policy(self,q_values):\n",
    "        \n",
    "        action = np.random.choice(np.where(action_values==action_values.max())[0])\n",
    "        \n",
    "        return action\n",
    "    \n",
    "    def epsilon_greedy_policy(self,q_values):\n",
    "        \n",
    "        if np.random.random()<self.epsilon:\n",
    "            action = np.random.randint(0,self.num_actions)\n",
    "        else:\n",
    "            action = self.greedy_policy(q_values)\n",
    "        return action\n",
    "    \n",
    "    def update(self,state,next_state,action,reward,done):\n",
    "        '''\n",
    "        state : List[]\n",
    "        '''\n",
    "        current_action_value = np.einsum('ij,ij->j',state,self.w)\n",
    "\n",
    "        next_action_value = np.max([np.einsum('ij,ij->j',self.tileCoder.getFeatures(next_state,a),self.w) for a in range(self.num_actions)])\n",
    "        \n",
    "        td_error = (reward + done * self.gamma * next_action_value - current_action_value)\n",
    "        \n",
    "        self.w += self.learning_rate*(td_error*state)\n",
    "        \n",
    "        return td_error\n",
    "    def on_epoch_end(self):\n",
    "        pass"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "id": "impaired-player",
   "metadata": {},
   "outputs": [],
   "source": [
    "class RegQ:\n",
    "    \n",
    "    def __init__(self,state_size,action_size,tileCoder,writer,config):\n",
    "        \n",
    "        self.w = np.random.uniform(-1,1,size=(state_size*action_size,1))  #np.random.uniform(-1,1,size=(state_size,action_size))\n",
    "        \n",
    "        self.gamma = config['gamma']\n",
    "        self.learning_rate = config['learning_rate']\n",
    "        self.eta = config['eta']\n",
    "        self.eta_decay_rate = config['eta_decay_rate']\n",
    "        self.tileCoder = tileCoder\n",
    "        self.num_actions = action_size\n",
    "        \n",
    "        self.target_policy = self.greedy_policy\n",
    "        self.behavior_policy = self.epsilon_greedy_policy\n",
    "\n",
    "    def greedy_policy(self,q_values):\n",
    "        \n",
    "        action = np.random.choice(np.where(action_values==action_values.max())[0])\n",
    "        \n",
    "        return action\n",
    "    \n",
    "    def epsilon_greedy_policy(self,q_values):\n",
    "        \n",
    "        if np.random.random()<self.epsilon:\n",
    "            action = np.random.randint(0,self.num_actions)\n",
    "        else:\n",
    "            action = self.greedy_policy(q_values)\n",
    "        return action\n",
    "    \n",
    "    def update(self,state,next_state,action,reward,done):\n",
    "        \n",
    "        current_action_value = np.einsum('ij,ij->j',state,self.w)\n",
    "\n",
    "        next_action_value = np.max([np.einsum('ij,ij->j',self.tileCoder.getFeatures(next_state,a),self.w) for a in range(self.num_actions)])\n",
    "        \n",
    "  \n",
    "        td_error = reward + done * self.gamma * next_action_value - current_action_value - self.eta * current_action_value \n",
    "        \n",
    "        self.w += self.learning_rate*(td_error*state)\n",
    "        \n",
    "        return td_error\n",
    "    \n",
    "    def on_epoch_end(self):\n",
    "        self.eta = max(0,self.eta - self.eta_decay_rate)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "id": "coated-engine",
   "metadata": {},
   "outputs": [],
   "source": [
    "class GradientQ2:\n",
    "    \n",
    "    def __init__(self,num_states,num_actions,writer,config):\n",
    "        \n",
    "        self.q_table = np.zeros((num_states,num_actions))   #np.random.uniform(-1,1,size=(state_size,action_size))\n",
    "        self.w_table = np.zeros((num_states,num_actions))\n",
    "        \n",
    "        self.num_states = num_states\n",
    "        self.num_actions = num_actions\n",
    "        \n",
    "        \n",
    "        self.gamma = config['gamma']\n",
    "        self.learning_rate = config['learning_rate']\n",
    "        \n",
    "        self.target_policy = self.greedy_policy\n",
    "        self.behavior_policy = self.epsilon_greedy_policy\n",
    "        \n",
    "        self.writer = writer\n",
    "        self.updates = 0\n",
    "\n",
    "    def greedy_policy(self,q_values):\n",
    "        \n",
    "        action = np.random.choice(np.where(action_values==action_values.max())[0])\n",
    "        \n",
    "        return action\n",
    "    \n",
    "    def epsilon_greedy_policy(self,q_values):\n",
    "        \n",
    "        if np.random.random()<self.epsilon:\n",
    "            action = np.random.randint(0,self.num_actions)\n",
    "        else:\n",
    "            action = self.greedy_policy(q_values)\n",
    "        return action\n",
    "    \n",
    "    def update(self,state,next_state,action,reward,done_mask):\n",
    "\n",
    "        \n",
    "        current_feature_vector = np.zeros((self.num_states,self.num_actions))\n",
    "        current_feature_vector[state,action] = 1\n",
    "        \n",
    "        next_value = np.sum(self.q_table[next_state],axis=0)\n",
    "        next_action = np.random.choice(np.where(next_value == next_value.max())[0])\n",
    "        next_feature_vector = np.zeros((self.num_states,self.num_actions))\n",
    "        next_feature_vector[next_state,next_action] = 1\n",
    "        \n",
    "        phi_w= np.sum(self.w_table[state,action],axis=0) #phi^T * w(s,a)\n",
    "        current_action_value = np.sum(self.q_table[state,action],axis=0)\n",
    "        \n",
    "        phi_theta = np.sum(self.q_table[state,action],axis=0)\n",
    "        \n",
    "        next_action_value = next_value[next_action]\n",
    "\n",
    "        gradient_theta = (current_feature_vector - done_mask *self.gamma * next_feature_vector)*phi_w - 0.001 *current_feature_vector * phi_theta #0.001 * self.q_table \n",
    "        self.q_table = np.clip(self.q_table+ self.learning_rate * gradient_theta,-400,400)\n",
    "        \n",
    "        td_error = reward + done_mask * self.gamma * next_action_value - current_action_value\n",
    "        gradient_w = (td_error-phi_w)*current_feature_vector\n",
    "        \n",
    "        self.w_table = np.clip(self.w_table + self.learning_rate * gradient_w,-400,400)\n",
    "        \n",
    "        self.updates += 1\n",
    "#         print('q',self.q_table[state,action])\n",
    "#         print('w',self.w_table[state,action])\n",
    "\n",
    "        self.writer.add_scalar('Current Action Value',current_action_value,self.updates)\n",
    "        self.writer.add_scalar('Next Action Value',next_action_value,self.updates)\n",
    "        self.writer.add_scalar('Max Gradient Theta',np.max(gradient_theta),self.updates)\n",
    "        self.writer.add_scalar('TD Error',td_error,self.updates)\n",
    "        "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 450,
   "id": "brazilian-cleaner",
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_agent(agent_type):\n",
    "    agent_dict = {'q':Q_Learning,'RegQ':RegQ,'gq':GradientQ, 'rgq':GradientQ2}\n",
    "    return agent_dict[agent_type]\n",
    "\n",
    "    \n",
    "def train(agent_type,config,model=None):\n",
    "    \n",
    "    # Import and initialize Mountain Car Environment\n",
    "    env = gym.make('MountainCar-v0')\n",
    "    env.reset()\n",
    "\n",
    "    features = np.array([env.observation_space.low,env.observation_space.high]).T\n",
    "    delta = env.observation_space.high-env.observation_space.low\n",
    "\n",
    "\n",
    "    num_tiles = 5\n",
    "    width = 10\n",
    "    offset = [i*(delta/width)/num_tiles for i in range(num_tiles)]\n",
    "\n",
    "    tileCoder = TileCoder(features,num_tiles,width,offset,env)\n",
    "    epochs = 1000\n",
    "    \n",
    "    num_states = tileCoder.n\n",
    "    num_actions = env.action_space.n\n",
    "    \n",
    "    epsilon = config['epsilon']\n",
    "\n",
    "\n",
    "    file_name = datetime.datetime.now().strftime(\"%Y-%m-%d_%H-%M-%S\")\n",
    "    writer = SummaryWriter(logdir=f'mountaincar_log/{file_name}')\n",
    "\n",
    "    agent = get_agent(agent_type)(num_states,num_actions,tileCoder,writer,config)\n",
    "    if not config['isTrain']:\n",
    "        agent.w = model\n",
    "        #agent.w = np.load(config['model_path'])\n",
    "    episode_reward_list = []\n",
    "    file_name = f'{agent_type}-tiles_{num_tiles}-width_{width}.npy'\n",
    "\n",
    "    for epoch in range(1,config['num_episodes']):\n",
    "\n",
    "        done = False\n",
    "        state = env.reset()\n",
    "        \n",
    "        episode_reward = 0\n",
    "        episode_steps = 0\n",
    "        epi_max_pos = -2 \n",
    "        epi_td_error = 0\n",
    "        epi_avg_pos = 0\n",
    "        epi_avg_vel = 0\n",
    "        epi_avg_action = 0\n",
    "\n",
    "        while not done:\n",
    "\n",
    "            if episode_steps > config['window_size']:\n",
    "                break\n",
    "\n",
    "            epi_max_pos = max(epi_max_pos,state[0])\n",
    "  \n",
    "\n",
    "            if np.random.random()<epsilon:\n",
    "                action = np.random.randint(0,num_actions)\n",
    "            else:\n",
    "                value = np.array([np.einsum('ij,ij->j',tileCoder.getFeatures(state,a),agent.w) for a in range(num_actions)])\n",
    "                action = np.random.choice(np.where( value == value.max())[0])\n",
    "                           \n",
    "                \n",
    "            encoded_state = tileCoder.getFeatures(state,action)\n",
    "            next_state,reward,done,info = env.step(action)\n",
    "    \n",
    "            done_mask = 0.0 if done else 1.0\n",
    "            \n",
    "            if config['isTrain']:\n",
    "                td_error = agent.update(encoded_state,next_state,action,reward,done_mask)\n",
    "                epi_td_error += td_error\n",
    "            \n",
    "\n",
    "            state = next_state\n",
    "            episode_reward += reward\n",
    "            episode_steps += 1\n",
    "            epi_avg_pos += state[0]\n",
    "            epi_avg_vel += state[1]\n",
    "            epi_avg_action += action\n",
    "            \n",
    "        if config['epsilon_decay']:\n",
    "            epsilon = max(config['min_epsilon'],epsilon-config['eta_decay_rate'])\n",
    "        \n",
    "        agent.on_epoch_end()\n",
    "    \n",
    "        writer.add_scalar('Episode Reward',episode_reward,epoch)\n",
    "        writer.add_scalar('Episode Maximum Position',epi_max_pos,epoch)\n",
    "        writer.add_scalar('Episode Avg Position',epi_avg_pos/episode_steps,epoch)\n",
    "        writer.add_scalar('Episode Avg Velocity',epi_avg_vel/episode_steps,epoch)\n",
    "        writer.add_scalar('Episode Avg Action',epi_avg_action/episode_steps,epoch)\n",
    "        writer.add_scalar('Episode Steps',episode_steps,epoch)\n",
    "        writer.add_scalar('Epsilons',epsilon,epoch)\n",
    "        writer.add_scalar('TD_ERROR',epi_td_error/episode_steps,epoch)\n",
    "        \n",
    "        episode_reward_list.append(episode_reward)\n",
    "\n",
    "        if epoch % config['num_print_episodes'] ==0 :\n",
    "            print(f\"Epoch:{epoch}, Episode Reward :{episode_reward}\")\n",
    "    \n",
    "    if config['isTrain']:\n",
    "        np.save(f'./mc_model/{file_name}',agent.w)\n",
    "        \n",
    "    return episode_reward_list,agent.w"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 63,
   "id": "formal-bookmark",
   "metadata": {
    "collapsed": true,
    "jupyter": {
     "outputs_hidden": true
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Epoch:100, Episode Reward :-105.0\n",
      "Epoch:200, Episode Reward :-150.0\n",
      "Epoch:300, Episode Reward :-132.0\n",
      "Epoch:400, Episode Reward :-106.0\n",
      "Epoch:500, Episode Reward :-141.0\n",
      "Epoch:600, Episode Reward :-106.0\n",
      "Epoch:700, Episode Reward :-105.0\n",
      "Epoch:800, Episode Reward :-105.0\n",
      "Epoch:900, Episode Reward :-106.0\n"
     ]
    }
   ],
   "source": [
    "episode_reward_list = train('RegQ')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 64,
   "id": "parliamentary-living",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "-127.52452452452452\n"
     ]
    }
   ],
   "source": [
    "print(np.mean(episode_reward_list))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 473,
   "id": "passive-legend",
   "metadata": {},
   "outputs": [],
   "source": [
    "def generate_config(isTrain):\n",
    "    \n",
    "    config = {'eta':0.1,'gamma':0.99,'learning_rate':0.1,'epsilon':0,'epsilon_decay':True,'min_epsilon':0.01,'isTrain':isTrain,\n",
    "              'num_episodes':10000,\n",
    "              'num_print_episodes':1000,\n",
    "              'window_size':200,\n",
    "             'model_path':'./mc_model/RegQ-tiles_5-width_10.npy'}\n",
    "    config['eta_decay_rate'] = 0   \n",
    "    if not isTrain:\n",
    "        config['num_episodes'] = 100\n",
    "    \n",
    "    return config \n",
    "\n",
    "def run():\n",
    "    Q = 'q'\n",
    "    RegQ = 'RegQ'\n",
    "    config = generate_config(isTrain=True)\n",
    "    test_config =  generate_config(isTrain=False)\n",
    "    num_runs = 100\n",
    "    print_run = 10\n",
    "    \n",
    "    total_test_reward_list = []\n",
    "    for run in range(num_runs):\n",
    "        \n",
    "        run_reward,weight = train(Q,config)\n",
    "\n",
    "        if run % print_run ==0 :\n",
    "            print(f'run:{run},reward:{np.mean(run_reward)}')\n",
    "        \n",
    "        test_reward,_ = train(Q,test_config,weight)\n",
    "        print(f'test_run reward:{np.mean(test_reward)}')\n",
    "        total_test_reward_list.append(test_reward)\n",
    "    return total_test_reward_list\n",
    "\n",
    "total_reward_list = run()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 447,
   "id": "mounted-civilian",
   "metadata": {},
   "outputs": [],
   "source": [
    "def easy_plot(runs = 1,xlabel='Number of Episodes',ylabel='Episode Returns'):\n",
    "    \n",
    "    save_dir = './figures'\n",
    "    \n",
    "    Q = 'q'\n",
    "    GQ = 'gq'\n",
    "    RGQ ='rgq'\n",
    "    RegQ= 'RegQ'\n",
    "    agent_types = [RegQ]  #[CQL,GQ,GGQ,GQV2]\n",
    "    COLORS = {RegQ:'orange',GQ:'red',Q:'green',RGQ:'blue'}\n",
    "    env_name = 'MountainCar'\n",
    "    suffix=\"\"\n",
    "    collector = Collector.remote()\n",
    "    \n",
    "    \n",
    "    for agent_type in agent_types:\n",
    "        for run in range(runs):\n",
    "            print('%s:%d-th run start!'%(agent_type,run))\n",
    "            np.random.seed(run)\n",
    "            episode_sum = train(agent_type)\n",
    "            collector.collect.remote(agent_type,episode_sum)\n",
    "\n",
    "    \n",
    "    \n",
    "    ax = plt.gca()\n",
    "    for name in ray.get(collector.get_all_data_keys.remote()):\n",
    "        data = ray.get(collector.getStats.remote(name))\n",
    "        confidence_plot(ax, data, label=name, color=COLORS[name])\n",
    "    plt.legend()\n",
    "    plt.ylabel(ylabel)\n",
    "    plt.xlabel(xlabel)\n",
    "    \n",
    "    save_dir = './figures'\n",
    "    file_name = env_name \n",
    "    path = os.path.join(save_dir,'%s.png'%file_name)\n",
    "    plt.savefig(path,dpi=200)\n",
    "    plt.show()\n",
    "\n",
    "ray.init(ignore_reinit_error=True)\n",
    "\n",
    "easy_plot(1)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
