{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "\n",
    "# custom libraries \n",
    "from envs import FourRooms\n",
    "from agents import GPIAgent\n",
    "from runners import run_experiment_episodic\n",
    "from utils import load_results\n",
    "\n",
    "%load_ext autoreload\n",
    "%autoreload 2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the representations - dicts of the form {room: {lambda: {LR, deltas}, policy}}\n",
    "# The cells below only read four_rooms_agents[room][agent_lambda]['LR'],\n",
    "# for rooms 0-3 and agent lambdas 0.0, 0.5 and 1.0.\n",
    "four_rooms_agents = load_results('fourrooms_lrs_stay')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Random-start sweep: for each agent lambda, build a GPI agent from the stored\n",
    "# per-room LRs, run n_eps episodes, and record the mean return and its\n",
    "# standard error in `results`.\n",
    "goals = [80, 24, 41]\n",
    "goal_rewards = [5, 10, 5]\n",
    "env_lambda_ = 0.5\n",
    "n_eps = 50\n",
    "agent_lambdas = [0.0, 0.5, 1.0]\n",
    "rooms = [0, 1, 2, 3]  # loop-invariant, so defined once up front\n",
    "\n",
    "results = {}\n",
    "for agent_lambda_ in agent_lambdas:\n",
    "    # A new environment is constructed for each agent lambda.\n",
    "    env = FourRooms(\n",
    "        goals=goals,\n",
    "        start_state=-1,  # random start state\n",
    "        goal_rewards=goal_rewards,\n",
    "        lambda_=env_lambda_,\n",
    "        discount=0.97)\n",
    "\n",
    "    w = env.r\n",
    "    LRs = [four_rooms_agents[r][agent_lambda_]['LR'] for r in rooms]\n",
    "    # The initial observation is passed as the third argument so that this call\n",
    "    # matches the other GPIAgent constructions in this notebook.\n",
    "    # NOTE(review): confirm the argument order against GPIAgent's signature.\n",
    "    # The literal 5 is presumably the number of actions - TODO confirm.\n",
    "    gpi_agent = GPIAgent(\n",
    "        env._layout.size, 5, env.get_obs(), w, LRs)\n",
    "\n",
    "    gpi_results = run_experiment_episodic(\n",
    "        env, gpi_agent, n_eps, display_eps=10, respect_done=True, max_ep_len=40\n",
    "    )\n",
    "\n",
    "    returns = gpi_results['return hist']\n",
    "    mean_gpi = np.mean(returns)\n",
    "    ste_gpi = np.std(returns) / np.sqrt(n_eps)\n",
    "    # The 'std' key is kept for compatibility, but the value stored under it is\n",
    "    # the standard error of the mean (std / sqrt(n_eps)).\n",
    "    results[agent_lambda_] = {'mean': mean_gpi, 'std': ste_gpi}\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Fixed-start sweep: for every candidate start state that is not a goal, run\n",
    "# one episode per agent lambda and keep the start state where the lambda=0.5\n",
    "# agent's return exceeds the average of the lambda=0.0 and lambda=1.0 agents'\n",
    "# returns by the largest margin.\n",
    "goals = [80, 24, 41]\n",
    "goal_rewards = [5, 10, 5]\n",
    "env_lambda_ = 0.5\n",
    "n_eps = 1  # episodes actually run per (start state, agent lambda) pair\n",
    "agent_lambdas = [0.0, 0.5, 1.0]\n",
    "rooms = [0, 1, 2, 3]\n",
    "\n",
    "# Build a reference environment in this cell instead of relying on an `env`\n",
    "# left over from an earlier cell, and snapshot the candidate start states\n",
    "# because `env` is rebound inside the loop below.\n",
    "reference_env = FourRooms(\n",
    "    goals=goals,\n",
    "    start_state=-1,\n",
    "    goal_rewards=goal_rewards,\n",
    "    lambda_=env_lambda_,\n",
    "    discount=0.97)\n",
    "candidate_starts = list(reference_env._possible_reward_states)\n",
    "\n",
    "max_diff = -np.inf\n",
    "# Initialised so both names exist even if no candidate start state qualifies.\n",
    "best_start = None\n",
    "best_traj_results = None\n",
    "for start_state in candidate_starts:\n",
    "    if start_state in goals:\n",
    "        continue  # goal states are skipped as start states\n",
    "\n",
    "    # NOTE: this rebinds the notebook-level name `results` for every start state.\n",
    "    results = {}\n",
    "    traj_results = {}\n",
    "    for agent_lambda_ in agent_lambdas:\n",
    "        env = FourRooms(\n",
    "            goals=goals,\n",
    "            start_state=start_state,  # fixed start state for this iteration\n",
    "            goal_rewards=goal_rewards,\n",
    "            lambda_=env_lambda_,\n",
    "            discount=0.97)\n",
    "\n",
    "        w = env.r\n",
    "        # pre-programmed allowed goals are: 80 (r0), 24 (r1), 41 (r2), 95 (r3)\n",
    "        LRs = [four_rooms_agents[r][agent_lambda_]['LR'] for r in rooms]\n",
    "        gpi_agent = GPIAgent(\n",
    "            env._layout.size, 5, env.get_obs(), w, LRs)\n",
    "\n",
    "        gpi_results = run_experiment_episodic(\n",
    "            env, gpi_agent, n_eps, display_eps=1, respect_done=True, max_ep_len=40\n",
    "        )\n",
    "\n",
    "        returns = gpi_results['return hist']\n",
    "        mean_gpi = np.mean(returns)\n",
    "        # Divide by the number of episodes that were actually run.\n",
    "        ste_gpi = np.std(returns) / np.sqrt(n_eps)\n",
    "        # The 'std' key holds the standard error of the mean.\n",
    "        results[agent_lambda_] = {'mean': mean_gpi, 'std': ste_gpi}\n",
    "        traj_results[agent_lambda_] = gpi_results['trajectory']\n",
    "\n",
    "    # Margin of the lambda=0.5 agent over the average of the other two agents.\n",
    "    diff = results[0.5]['mean'] - np.mean([results[0.0]['mean'], results[1.0]['mean']])\n",
    "    if diff > max_diff:\n",
    "        max_diff = diff\n",
    "        best_start = start_state\n",
    "        best_traj_results = traj_results"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Fixed-start rollout: run one episode per agent lambda from a single start\n",
    "# state and keep the runner's 'episode_data' for each agent lambda.\n",
    "goals = [80, 24, 41]\n",
    "goal_rewards = [5, 10, 5]\n",
    "env_lambda_ = 0.5\n",
    "n_eps = 1  # episodes actually run per agent lambda\n",
    "agent_lambdas = [0.0, 0.5, 1.0]\n",
    "rooms = [0, 1, 2, 3]\n",
    "# NOTE(review): presumably the start state chosen by the fixed-start sweep -\n",
    "# confirm, or use best_start directly.\n",
    "fixed_start_state = 69\n",
    "\n",
    "episode_datas = {}\n",
    "for agent_lambda_ in agent_lambdas:\n",
    "    env = FourRooms(\n",
    "        goals=goals,\n",
    "        start_state=fixed_start_state,  # fixed start state\n",
    "        goal_rewards=goal_rewards,\n",
    "        lambda_=env_lambda_,\n",
    "        discount=0.97)\n",
    "\n",
    "    w = env.r\n",
    "    LRs = [four_rooms_agents[r][agent_lambda_]['LR'] for r in rooms]\n",
    "    gpi_agent = GPIAgent(\n",
    "        env._layout.size, 5, env.get_obs(), w, LRs)\n",
    "\n",
    "    gpi_results = run_experiment_episodic(\n",
    "        env, gpi_agent, n_eps, display_eps=1, respect_done=True, max_ep_len=40\n",
    "    )\n",
    "\n",
    "    returns = gpi_results['return hist']\n",
    "    mean_gpi = np.mean(returns)\n",
    "    # Divide by the number of episodes that were actually run.\n",
    "    ste_gpi = np.std(returns) / np.sqrt(n_eps)\n",
    "    episode_datas[agent_lambda_] = gpi_results['episode_data']\n"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
