{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os, sys, time, random\n",
    "import numpy as np\n",
    "import gym\n",
    "import torch as T\n",
    "import torch.nn as nn\n",
    "import torch.nn.functional as F\n",
    "import torch.optim as optim\n",
    "import matplotlib.pyplot as plt\n",
    "\n",
    "\n",
    "SEED = 222\n",
    "random.seed(SEED)\n",
    "np.random.seed(SEED)\n",
    "T.manual_seed(SEED)\n",
    "if T.cuda.is_available():\n",
    "    T.cuda.manual_seed(SEED)\n",
    "    T.cuda.manual_seed_all(SEED)\n",
    "T.backends.cudnn.deterministic = True\n",
    "T.backends.cudnn.benchmark = False\n",
    "\n",
    "env_name = 'Acrobot-v1'\n",
    "cuda_enabled = T.cuda.is_available()\n",
    "\n",
    "\n",
    "\n",
    "saved_model_path = 'saved_models/trained_model_'\n",
    "os.makedirs('saved_models', exist_ok=True)\n",
    "\n",
    "\n",
    "class DoubleDQN(nn.Module):\n",
    "    def __init__(self, lr, input_dims, n_actions):\n",
    "        super(DoubleDQN, self).__init__()\n",
    "        self.input_dims = input_dims\n",
    "        self.n_actions = n_actions\n",
    "        self.fc1 = nn.Linear(*self.input_dims, 256)\n",
    "        self.fc2 = nn.Linear(256, 512)\n",
    "        self.fc3 = nn.Linear(512, 512)\n",
    "        self.fc4 = nn.Linear(512, self.n_actions)\n",
    "\n",
    "        self.optimizer = optim.Adam(self.parameters(), lr=lr)\n",
    "        self.loss = nn.MSELoss()\n",
    "        self.device = T.device('cuda:0' if cuda_enabled else 'cpu')\n",
    "        self.to(self.device)\n",
    "\n",
    "    def forward(self, state):\n",
    "        x = F.relu(self.fc1(state))\n",
    "        x = F.relu(self.fc2(x))\n",
    "        x = F.relu(self.fc3(x))\n",
    "        return self.fc4(x)\n",
    "\n",
    "    def save_model(self, tag):\n",
    "        \n",
    "        T.save(self.state_dict(), saved_model_path + tag + '.pt')\n",
    "\n",
    "    def load_model(self, tag):\n",
    "        \n",
    "        self.load_state_dict(T.load(saved_model_path + tag + '.pt'))\n",
    "\n",
    "class Agent:\n",
    "    def __init__(self, gamma, epsilon, lr, input_dims, batch_size, n_actions,\n",
    "                 D=0.0, max_mem_size=50000, eps_end=0.01, eps_dec=5e-4):\n",
    "        self.D = D\n",
    "        self.gamma = gamma\n",
    "        self.epsilon = epsilon\n",
    "        self.eps_min = eps_end\n",
    "        self.eps_dec = eps_dec\n",
    "        self.action_space = list(range(n_actions))\n",
    "        self.batch_size = batch_size\n",
    "        self.mem_size = max_mem_size\n",
    "        self.mem_cntr = 0\n",
    "        self.iter_cntr = 0\n",
    "        self.replace_target = 100\n",
    "\n",
    "        self.Q_local = DoubleDQN(lr, input_dims=input_dims, n_actions=n_actions)\n",
    "        self.Q_target = DoubleDQN(lr, input_dims=input_dims, n_actions=n_actions)\n",
    "\n",
    "        sz = self.mem_size\n",
    "        dims = input_dims\n",
    "        self.state_memory = np.zeros((sz, *dims), dtype=np.float32)\n",
    "        self.new_state_memory = np.zeros((sz, *dims), dtype=np.float32)\n",
    "        self.action_memory = np.zeros(sz, dtype=np.int64)\n",
    "        self.reward_memory = np.zeros(sz, dtype=np.float32)\n",
    "        self.terminal_memory = np.zeros(sz, dtype=bool)\n",
    "\n",
    "    def store_transition(self, state, action, reward, state_, done):\n",
    "        idx = self.mem_cntr % self.mem_size\n",
    "        self.state_memory[idx] = state\n",
    "        self.new_state_memory[idx] = state_\n",
    "        self.reward_memory[idx] = reward\n",
    "        self.action_memory[idx] = action\n",
    "        self.terminal_memory[idx] = done\n",
    "        self.mem_cntr += 1\n",
    "\n",
    "    def choose_action(self, obs):\n",
    "        state = T.from_numpy(obs).float().unsqueeze(0).to(self.Q_local.device)\n",
    "        if np.random.random() < self.epsilon:\n",
    "            return np.random.choice(self.action_space)\n",
    "        self.Q_local.eval()\n",
    "        with T.no_grad():\n",
    "            q_vals = self.Q_local(state).cpu().numpy().squeeze()\n",
    "        self.Q_local.train()\n",
    "        max_q = np.max(q_vals)\n",
    "        for i, q in enumerate(q_vals):\n",
    "            if max_q - q <= self.D:\n",
    "                return i\n",
    "        return int(np.argmax(q_vals))\n",
    "\n",
    "    def choose_action1(self, obs):\n",
    "        state = T.from_numpy(obs).float().unsqueeze(0).to(self.Q_local.device)\n",
    "        self.Q_local.eval()\n",
    "        with T.no_grad():\n",
    "            q_vals = self.Q_local(state).cpu().numpy().squeeze()\n",
    "        max_q = np.max(q_vals)\n",
    "        for i, q in enumerate(q_vals):\n",
    "            if max_q - q <= self.D:\n",
    "                return i\n",
    "        return int(np.argmax(q_vals))\n",
    "\n",
    "    def replace_target_network(self):\n",
    "        if self.iter_cntr % self.replace_target == 0:\n",
    "            self.Q_target.load_state_dict(self.Q_local.state_dict())\n",
    "\n",
    "    def learn(self):\n",
    "        if self.mem_cntr < self.batch_size:\n",
    "            return\n",
    "        self.Q_local.optimizer.zero_grad()\n",
    "        self.replace_target_network()\n",
    "\n",
    "        max_mem = min(self.mem_cntr, self.mem_size)\n",
    "        batch = np.random.choice(max_mem, self.batch_size, replace=False)\n",
    "        idx = np.arange(self.batch_size)\n",
    "\n",
    "        states = T.tensor(self.state_memory[batch]).to(self.Q_local.device)\n",
    "        states_ = T.tensor(self.new_state_memory[batch]).to(self.Q_local.device)\n",
    "        actions = T.tensor(self.action_memory[batch]).to(self.Q_local.device)\n",
    "        rewards = T.tensor(self.reward_memory[batch]).to(self.Q_local.device)\n",
    "        dones = T.tensor(self.terminal_memory[batch]).to(self.Q_local.device)\n",
    "\n",
    "        q_pred = self.Q_local(states)[idx, actions]\n",
    "        q_next = self.Q_target(states_)\n",
    "        q_eval = self.Q_local(states_)\n",
    "        max_act = T.argmax(q_eval, dim=1)\n",
    "        q_next[dones] = 0.0\n",
    "        q_target = rewards + self.gamma * q_next[idx, max_act]\n",
    "\n",
    "        loss = self.Q_local.loss(q_target, q_pred)\n",
    "        loss.backward()\n",
    "        self.Q_local.optimizer.step()\n",
    "        self.iter_cntr += 1\n",
    "        self.epsilon = max(self.epsilon - self.eps_dec, self.eps_min)\n",
    "\n",
    "    def save_agent(self):\n",
    "        self.Q_local.save_model('local')\n",
    "        self.Q_target.save_model('target')\n",
    "\n",
    "    def load_agent(self):\n",
    "        self.Q_local.load_model('local')\n",
    "        self.Q_target.load_model('target')\n",
    "\n",
    "\n",
    "\n",
    "D_values = [0, 0.05,0.1,0.5,0.2]\n",
    "num_runs = 25\n",
    "epochs = 90\n",
    "batch_size = 8192\n",
    "\n",
    "all_scores = {}\n",
    "os.makedirs('25runsresultsseed222', exist_ok=True)  \n",
    "\n",
    "for D in D_values:\n",
    "    runs_scores = []\n",
    "    for run in range(num_runs):\n",
    "       \n",
    "\n",
    "        \n",
    "        env = gym.make(env_name)\n",
    "        env.reset(seed=SEED)\n",
    "        env.action_space.seed(SEED)\n",
    "        try:\n",
    "            env.observation_space.seed(SEED)\n",
    "        except:\n",
    "            pass\n",
    "\n",
    "        agent = Agent(\n",
    "            gamma=0.99, epsilon=1.0, lr=1e-5,\n",
    "            input_dims=env.observation_space.shape,\n",
    "            batch_size=batch_size,\n",
    "            n_actions=env.action_space.n,\n",
    "            D=D\n",
    "        )\n",
    "\n",
    "        scores1 = []\n",
    "        for ep in range(1, epochs+1):\n",
    "            obs, _ = env.reset(seed=ep + 200)\n",
    "            done = False\n",
    "            while not done:\n",
    "                action = agent.choose_action(obs)\n",
    "                obs_, reward, term, trunc, _ = env.step(action)\n",
    "                done = term or trunc\n",
    "                agent.store_transition(obs, action, reward, obs_, done)\n",
    "                agent.learn()\n",
    "                obs = obs_\n",
    "\n",
    "            \n",
    "            if ep % 10 == 0:\n",
    "                eval_score = 0\n",
    "                for a in range(100):\n",
    "                    obs, _ = env.reset(seed=a)\n",
    "                    done = False\n",
    "                    while not done:\n",
    "                        action = agent.choose_action1(obs)\n",
    "                        obs_, r, term, trunc, _ = env.step(action)\n",
    "                        done = term or trunc\n",
    "                        obs = obs_\n",
    "                        eval_score += r\n",
    "                avg_score = eval_score / 100.0\n",
    "                scores1.append(avg_score)\n",
    "                \n",
    "\n",
    "        runs_scores.append(scores1)\n",
    "        env.close()\n",
    "\n",
    "        \n",
    "        np.save(f\"25runsresultsseed222/scores_D_{D}_run_{run}.npy\", np.array(scores1))\n",
    "\n",
    "    all_scores[D] = np.array(runs_scores)\n",
    "\n",
    "\n",
    "plt.figure(figsize=(10, 6))\n",
    "for D, data in all_scores.items():\n",
    "    mean_scores = data.mean(axis=0)\n",
    "    std_scores  = data.std(axis=0)\n",
    "    x = np.arange(len(mean_scores))\n",
    "    plt.plot(x, mean_scores, label=f\"D={D}\")\n",
    "    plt.fill_between(x, mean_scores-std_scores, mean_scores+std_scores, alpha=0.2)\n",
    "plt.title(f\"Double DQN on {env_name}: Mean ± Std over {num_runs} Runs\")\n",
    "plt.xlabel(\"Evaluation (per 10 episodes)\")\n",
    "plt.ylabel(\"Average Score\")\n",
    "plt.legend()\n",
    "plt.grid(True)\n",
    "plt.tight_layout()\n",
    "plt.show()\n"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
