{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import gym\n",
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "import os\n",
    "\n",
    "\n",
    "\n",
    "\n",
    "# Base seed: every env.reset() call below derives its seed from this value plus an offset.\n",
    "SEED = 42  \n",
    "\n",
    "def get_discrete_state(state, env, discrete_os_win_size):\n",
    "    \"\"\"Map a continuous observation to a tuple of Q-table bin indices.\n",
    "\n",
    "    Args:\n",
    "        state: Observation array returned by the environment.\n",
    "        env: Environment whose ``observation_space.low`` / ``.high`` bound the grid.\n",
    "        discrete_os_win_size: Width of one bin along each observation dimension.\n",
    "\n",
    "    Returns:\n",
    "        Tuple of integer bin indices, one per observation dimension, clipped\n",
    "        into the valid range so it can always be used to index the Q-table.\n",
    "    \"\"\"\n",
    "    low = env.observation_space.low\n",
    "    high = env.observation_space.high\n",
    "    # Recover the number of bins per dimension from the range and the bin width.\n",
    "    n_bins = np.rint((high - low) / discrete_os_win_size).astype(int)\n",
    "    raw_index = ((state - low) / discrete_os_win_size).astype(int)\n",
    "    # An observation exactly on the upper bound would land in bin n_bins (one\n",
    "    # past the end) and one below the lower bound would go negative and wrap\n",
    "    # around silently, so clip both into [0, n_bins - 1].\n",
    "    last_bin = np.maximum(n_bins - 1, 0)\n",
    "    return tuple(np.clip(raw_index, 0, last_bin))\n",
    "\n",
    "def test_agent(env, q_table, get_discrete_state_func, D, episodes=20):\n",
    "    \"\"\"Evaluate the threshold-greedy policy and return its mean episode return.\n",
    "\n",
    "    Every evaluation episode is reset with the seed ``SEED + 100000 + index``.\n",
    "    At each step the lowest-indexed action whose Q-value lies within ``D`` of\n",
    "    the best Q-value is executed; no exploration and no learning take place.\n",
    "\n",
    "    Args:\n",
    "        env: Environment exposing ``reset``, ``step`` and ``action_space.n``.\n",
    "        q_table: Array indexed by a discrete state tuple, yielding per-action values.\n",
    "        get_discrete_state_func: Callable mapping an observation to a state tuple.\n",
    "        D: Tolerance below the maximum Q-value within which an action is accepted.\n",
    "        episodes: Number of evaluation episodes to run.\n",
    "\n",
    "    Returns:\n",
    "        Mean of the summed rewards over all evaluation episodes.\n",
    "    \"\"\"\n",
    "    n_actions = env.action_space.n\n",
    "    episode_returns = []\n",
    "\n",
    "    for episode_idx in range(episodes):\n",
    "        observation, _ = env.reset(seed=SEED + 100000 + episode_idx)\n",
    "        cell = get_discrete_state_func(observation)\n",
    "        episode_return = 0\n",
    "        finished = False\n",
    "\n",
    "        while not finished:\n",
    "            q_values = q_table[cell]\n",
    "            best_q = np.max(q_values)\n",
    "            # First action within D of the best value; if none qualifies the\n",
    "            # scan falls through to the last action, as a plain loop would.\n",
    "            action = next(\n",
    "                (a for a in range(n_actions) if best_q - q_values[a] <= D),\n",
    "                n_actions - 1,\n",
    "            )\n",
    "            observation, reward, terminated, truncated, _ = env.step(action)\n",
    "            finished = terminated or truncated\n",
    "            episode_return += reward\n",
    "            cell = get_discrete_state_func(observation)\n",
    "\n",
    "        episode_returns.append(episode_return)\n",
    "\n",
    "    return np.mean(episode_returns)\n",
    "\n",
    "def run_with_D(D, episodes=10000, test_every=200, seed_offset=0):\n",
    "    \"\"\"Train tabular Q-learning on MountainCar-v0 with a threshold-greedy policy.\n",
    "\n",
    "    Exploration is epsilon-greedy with epsilon decaying linearly from 1 to a\n",
    "    floor of 0.01 over the first ~500 episodes. When not exploring, the\n",
    "    lowest-indexed action whose Q-value is within ``D`` of the maximum is taken.\n",
    "    Every ``test_every`` episodes the current table is evaluated with\n",
    "    ``test_agent`` over 100 episodes.\n",
    "\n",
    "    Args:\n",
    "        D: Non-negative tolerance below the maximum Q-value for accepting an action.\n",
    "        episodes: Number of training episodes.\n",
    "        test_every: Evaluation period, in training episodes.\n",
    "        seed_offset: Added to ``SEED`` for both the per-episode environment\n",
    "            seeds and the NumPy generator, so distinct runs are independent\n",
    "            yet reproducible.\n",
    "\n",
    "    Returns:\n",
    "        List of mean evaluation returns, one entry per evaluation point.\n",
    "\n",
    "    Raises:\n",
    "        ValueError: If ``D`` is negative (no action could satisfy the threshold).\n",
    "    \"\"\"\n",
    "    if D < 0:\n",
    "        raise ValueError(f\"D must be non-negative, got {D}\")\n",
    "\n",
    "    # Dedicated generator: the global NumPy RNG and the action-space sampler\n",
    "    # were never seeded, so Q-table initialisation and exploration differed\n",
    "    # between otherwise identical runs.\n",
    "    rng = np.random.default_rng(SEED + seed_offset)\n",
    "\n",
    "    env = gym.make(\"MountainCar-v0\", render_mode=None)\n",
    "    try:\n",
    "        env_unwrapped = env.unwrapped\n",
    "        n_actions = env.action_space.n\n",
    "\n",
    "        bins = [20, 20]\n",
    "        discrete_os_win_size = (env.observation_space.high - env.observation_space.low) / bins\n",
    "        q_table = rng.uniform(low=-2, high=0, size=(bins + [n_actions]))\n",
    "\n",
    "        def get_ds(state):\n",
    "            return get_discrete_state(state, env, discrete_os_win_size)\n",
    "\n",
    "        lr = 0.1\n",
    "        gamma = 0.95\n",
    "        test_rewards = []\n",
    "\n",
    "        for ep in range(episodes):\n",
    "            state, _ = env.reset(seed=SEED + ep + seed_offset)\n",
    "            ds = get_ds(state)\n",
    "            done = False\n",
    "\n",
    "            epsilon = max(0.01, 1 - ep / 500)\n",
    "\n",
    "            while not done:\n",
    "                if rng.random() < epsilon:\n",
    "                    action = int(rng.integers(n_actions))\n",
    "                else:\n",
    "                    qs = q_table[ds]\n",
    "                    max_q = np.max(qs)\n",
    "                    # Lowest-indexed action within D of the best value. With\n",
    "                    # D >= 0 the argmax itself always qualifies; it is also\n",
    "                    # the explicit fallback so `action` is never left unbound.\n",
    "                    action = next(\n",
    "                        (a for a in range(n_actions) if max_q - qs[a] <= D),\n",
    "                        int(np.argmax(qs)),\n",
    "                    )\n",
    "\n",
    "                ns, r, terminated, truncated, _ = env.step(action)\n",
    "                done = terminated or truncated\n",
    "                nds = get_ds(ns)\n",
    "\n",
    "                if not done:\n",
    "                    future = np.max(q_table[nds])\n",
    "                    current_q = q_table[ds + (action,)]\n",
    "                    q_table[ds + (action,)] = (1 - lr) * current_q + lr * (r + gamma * future)\n",
    "                elif ns[0] >= env_unwrapped.goal_position:\n",
    "                    # Reaching the goal pins the value of the final transition to 0;\n",
    "                    # an episode that merely ran out of time applies no update.\n",
    "                    q_table[ds + (action,)] = 0\n",
    "\n",
    "                ds = nds\n",
    "\n",
    "            if (ep + 1) % test_every == 0:\n",
    "                avg_r = test_agent(env, q_table, get_ds, D, episodes=100)\n",
    "                test_rewards.append(avg_r)\n",
    "    finally:\n",
    "        # Release the environment even if training or evaluation raises.\n",
    "        env.close()\n",
    "\n",
    "    return test_rewards\n",
    "\n",
    "\n",
    "if __name__ == \"__main__\":\n",
    "    # Per-threshold experiment: train (or load cached) learning curves for each\n",
    "    # D, then plot the smoothed mean ± std across runs on a separate figure.\n",
    "    Ds = [0,0.001,0.005,0.01,0.02]\n",
    "    episodes = 10000\n",
    "    test_every = 200\n",
    "    runs_per_D = 25\n",
    "\n",
    "    for D in Ds:\n",
    "        filename = f\"results2_D={D}.npy\"\n",
    "        if os.path.exists(filename):\n",
    "            # The cache is a pickled dict, hence allow_pickle=True. Only load\n",
    "            # files written by this notebook: unpickling untrusted data can\n",
    "            # execute arbitrary code.\n",
    "            all_curves = np.load(filename, allow_pickle=True).item()\n",
    "        else:\n",
    "            all_curves = {D: []}\n",
    "\n",
    "            for run in range(runs_per_D):\n",
    "                # A stride of 10000 keeps the per-episode environment seeds of\n",
    "                # different runs disjoint as long as episodes <= 10000.\n",
    "                curve = run_with_D(D, episodes=episodes, test_every=test_every, seed_offset=run * 10000)\n",
    "                all_curves[D].append(curve)\n",
    "\n",
    "            np.save(filename, all_curves)\n",
    "\n",
    "        data = np.array(all_curves[D])\n",
    "        mean = data.mean(axis=0)\n",
    "        std = data.std(axis=0)\n",
    "\n",
    "        plt.figure(figsize=(12, 6))\n",
    "        x_ticks = np.arange(test_every, episodes + 1, test_every)\n",
    "        window = 5\n",
    "        kernel = np.ones(window) / window\n",
    "        mean_smooth = np.convolve(mean, kernel, mode='valid')\n",
    "        std_smooth = np.convolve(std, kernel, mode='valid')\n",
    "        # With mode='valid', output i averages test points i .. i+window-1, so\n",
    "        # place it at the last episode of that window. Using the first ticks\n",
    "        # instead would shift the curve (window-1)*test_every episodes early.\n",
    "        x_smooth = x_ticks[window - 1:window - 1 + len(mean_smooth)]\n",
    "\n",
    "        plt.plot(x_smooth, mean_smooth, label=f\"D={D}\")\n",
    "        plt.fill_between(x_smooth, mean_smooth - std_smooth, mean_smooth + std_smooth, alpha=0.3)\n",
    "\n",
    "        plt.xlabel(\"Episode\")\n",
    "        plt.ylabel(\"Average Test Reward\")\n",
    "        plt.title(f\"MountainCar-v0 Q-learning with ε-greedy + Threshold D={D}\\n(Mean ± Std, Smoothed)\")\n",
    "        plt.grid(True)\n",
    "        plt.legend()\n",
    "        plt.show()\n",
    "\n",
    "\n",
    "# Combined figure: overlay the cached mean ± std learning curves of several\n",
    "# thresholds on one set of axes and save it to disk.\n",
    "Ds = [0,0.001,0.005,0.02] \n",
    "episodes = 10000\n",
    "test_every = 200\n",
    "window = 1  # moving-average width in test points; 1 means no smoothing\n",
    "\n",
    "mean_curves = {}\n",
    "std_curves = {}\n",
    "\n",
    "for D in Ds:\n",
    "    filename = f\"results2_D={D}.npy\"\n",
    "    if not os.path.exists(filename):\n",
    "        # No cached results for this threshold: leave it out of the figure.\n",
    "        continue\n",
    "\n",
    "    # The cache is a pickled dict, hence allow_pickle=True. Only load files\n",
    "    # written by this notebook: unpickling untrusted data can execute code.\n",
    "    all_curves = np.load(filename, allow_pickle=True).item()\n",
    "    data = np.array(all_curves[D])  # shape: [runs, test_points]\n",
    "    mean = data.mean(axis=0)\n",
    "    std = data.std(axis=0)\n",
    "\n",
    "    mean_curves[D] = mean\n",
    "    std_curves[D] = std\n",
    "\n",
    "\n",
    "plt.figure(figsize=(12, 6))\n",
    "x_ticks = np.arange(test_every, episodes + 1, test_every)\n",
    "kernel = np.ones(window) / window\n",
    "\n",
    "for D in Ds:\n",
    "    if D not in mean_curves:\n",
    "        continue\n",
    "    mean = mean_curves[D]\n",
    "    std = std_curves[D]\n",
    "    mean_smooth = np.convolve(mean, kernel, mode='valid')\n",
    "    std_smooth = np.convolve(std, kernel, mode='valid')\n",
    "    # With mode='valid', output i averages test points i .. i+window-1, so\n",
    "    # place it at the last episode of that window (a no-op when window == 1).\n",
    "    x_smooth = x_ticks[window - 1:window - 1 + len(mean_smooth)]\n",
    "    plt.plot(x_smooth, mean_smooth,label=fr\"$r_{{\\mathrm{{action}}}}={D}$\")\n",
    "    plt.fill_between(x_smooth, mean_smooth - std_smooth, mean_smooth + std_smooth, alpha=0.3)\n",
    "\n",
    "# Label the axes so the saved figure stands alone, matching the per-D plots.\n",
    "plt.xlabel(\"Episode\")\n",
    "plt.ylabel(\"Average Test Reward\")\n",
    "plt.grid(True)\n",
    "# Span from the first evaluation point to the last training episode.\n",
    "plt.xlim(test_every, episodes)\n",
    "plt.legend()\n",
    "plt.savefig(\"mountaincar_tabularql_vs_D.png\", dpi=600)\n",
    "plt.show()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
