{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "import numpy as np\n",
    "import torch\n",
    "import torch.nn as nn\n",
    "import torch.optim as optim\n",
    "import torch.nn.functional as F\n",
    "import random\n",
    "from collections import namedtuple, deque\n",
    "import matplotlib.pyplot as plt\n",
    "import os\n",
    "import gym\n",
    "\n",
    "\n",
    "\n",
    "# Pick the compute device: CUDA when torch reports one as available, otherwise the CPU.\n",
    "if torch.cuda.is_available():\n",
    "    device = torch.device(\"cuda\")\n",
    "else:\n",
    "    device = torch.device(\"cpu\")\n",
    "\n",
    "\n",
    "# Q-value approximator: a small fully connected network\n",
    "class QNetwork(nn.Module):\n",
    "    \"\"\"Multilayer perceptron that maps a state vector to one Q-value per action.\n",
    "\n",
    "    Two ReLU hidden layers (``fc1``, ``fc2``) feed a linear output layer (``fc3``).\n",
    "    The module moves itself onto the notebook-level ``device`` when constructed.\n",
    "    \"\"\"\n",
    "\n",
    "    def __init__(self, state_size, action_size, seed, fc1_units=64, fc2_units=64):\n",
    "        \"\"\"Create the three linear layers.\n",
    "\n",
    "        Args:\n",
    "            state_size: Number of input features.\n",
    "            action_size: Number of outputs (one Q-value per action).\n",
    "            seed: Passed to ``torch.manual_seed`` before the layers are created.\n",
    "            fc1_units: Width of the first hidden layer.\n",
    "            fc2_units: Width of the second hidden layer.\n",
    "        \"\"\"\n",
    "        super().__init__()\n",
    "        # Seed torch's global RNG before any layer is built so the initial weights\n",
    "        # depend only on `seed`; the Generator returned by torch is kept on the instance.\n",
    "        self.seed = torch.manual_seed(seed)\n",
    "\n",
    "        widths = (state_size, fc1_units, fc2_units, action_size)\n",
    "        self.fc1 = nn.Linear(widths[0], widths[1])\n",
    "        self.fc2 = nn.Linear(widths[1], widths[2])\n",
    "        self.fc3 = nn.Linear(widths[2], widths[3])\n",
    "\n",
    "        self.to(device)\n",
    "\n",
    "    def forward(self, state):\n",
    "        \"\"\"Return the network's Q-value output for ``state``.\"\"\"\n",
    "        hidden = F.relu(self.fc1(state))\n",
    "        hidden = F.relu(self.fc2(hidden))\n",
    "        q_values = self.fc3(hidden)\n",
    "        return q_values\n",
    "\n",
    "class ReplayBuffer:\n",
    "    \"\"\"Fixed-capacity FIFO store of transitions with uniform random minibatch sampling.\"\"\"\n",
    "\n",
    "    def __init__(self, action_size, buffer_size, batch_size, seed):\n",
    "        \"\"\"Create an empty buffer.\n",
    "\n",
    "        Args:\n",
    "            action_size: Number of discrete actions (only stored on the instance).\n",
    "            buffer_size: Maximum number of transitions kept; the oldest are dropped first.\n",
    "            batch_size: Number of transitions returned by ``sample``.\n",
    "            seed: Value handed to ``random.seed``. This re-seeds the *global* ``random``\n",
    "                module, which is what ``sample`` draws from.\n",
    "        \"\"\"\n",
    "        self.action_size = action_size\n",
    "        self.memory = deque(maxlen=buffer_size)\n",
    "        self.batch_size = batch_size\n",
    "        self.experience = namedtuple(\"Experience\", field_names=[\"state\", \"action\", \"reward\", \"next_state\", \"done\"])\n",
    "        # random.seed() returns None, so seed first and then remember the seed value itself\n",
    "        # (previously self.seed was always None).\n",
    "        random.seed(seed)\n",
    "        self.seed = seed\n",
    "\n",
    "    def add(self, state, action, reward, next_state, done):\n",
    "        \"\"\"Append one transition; once the buffer is full the oldest entry is evicted.\"\"\"\n",
    "        self.memory.append(self.experience(state, action, reward, next_state, done))\n",
    "\n",
    "    def sample(self):\n",
    "        \"\"\"Draw ``batch_size`` stored transitions uniformly at random, without replacement.\n",
    "\n",
    "        Returns:\n",
    "            Tuple ``(states, actions, rewards, next_states, dones)`` of tensors on ``device``.\n",
    "            Each field is stacked row-wise with ``np.vstack``; ``actions`` is int64 and the\n",
    "            other four are float32 (``dones`` holds 0.0 / 1.0).\n",
    "\n",
    "        Raises:\n",
    "            ValueError: Raised by ``random.sample`` when fewer than ``batch_size``\n",
    "                transitions are stored.\n",
    "        \"\"\"\n",
    "        drawn = random.sample(self.memory, k=self.batch_size)\n",
    "        experiences = [e for e in drawn if e is not None]\n",
    "\n",
    "        def stack(field):\n",
    "            # One row per sampled transition for the requested namedtuple field.\n",
    "            return np.vstack([getattr(e, field) for e in experiences])\n",
    "\n",
    "        states = torch.from_numpy(stack(\"state\")).float().to(device)\n",
    "        actions = torch.from_numpy(stack(\"action\")).long().to(device)\n",
    "        rewards = torch.from_numpy(stack(\"reward\")).float().to(device)\n",
    "        next_states = torch.from_numpy(stack(\"next_state\")).float().to(device)\n",
    "        # Booleans go through uint8 first so they become 0.0 / 1.0 floats.\n",
    "        dones = torch.from_numpy(stack(\"done\").astype(np.uint8)).float().to(device)\n",
    "\n",
    "        return (states, actions, rewards, next_states, dones)\n",
    "\n",
    "    def __len__(self):\n",
    "        \"\"\"Return the number of transitions currently stored.\"\"\"\n",
    "        return len(self.memory)\n",
    "\n",
    "\n",
    "\n",
    "\n",
    "class DQNAgent:\n",
    "    \"\"\"DQN agent whose exploitation step uses a Q-value tolerance ``D``.\n",
    "\n",
    "    When exploiting, the agent returns the lowest-indexed action whose Q-value lies\n",
    "    within ``D`` of the maximum Q-value; ``D = 0`` reduces to ordinary greedy selection.\n",
    "    \"\"\"\n",
    "\n",
    "    BUFFER_SIZE = int(1e5)  # capacity of the agent's own replay buffer\n",
    "    BATCH_SIZE = 64         # minibatch size used by step()\n",
    "    UPDATE_EVERY = 4        # step() learns once per this many calls\n",
    "    GAMMA = 0.99            # discount factor used by step()\n",
    "    TAU = 1e-3              # interpolation factor of the soft target update\n",
    "\n",
    "    def __init__(self, state_size, action_size, seed, lr, D=0.0):\n",
    "        \"\"\"Build the local/target networks, the optimizer and the replay buffer.\n",
    "\n",
    "        Args:\n",
    "            state_size: Number of state features.\n",
    "            action_size: Number of discrete actions.\n",
    "            seed: Seed for the global ``random`` module and for both networks.\n",
    "            lr: Learning rate of the Adam optimizer.\n",
    "            D: Q-value tolerance used when exploiting (see class docstring).\n",
    "        \"\"\"\n",
    "        self.state_size = state_size\n",
    "        self.action_size = action_size\n",
    "        # random.seed() returns None; seed first, then keep the actual seed value.\n",
    "        random.seed(seed)\n",
    "        self.seed = seed\n",
    "        self.D = D\n",
    "        self.lr = lr\n",
    "\n",
    "        # Both networks seed torch with the same value, so they start with identical weights.\n",
    "        self.qnetwork_local = QNetwork(state_size, action_size, seed).to(device)\n",
    "        self.qnetwork_target = QNetwork(state_size, action_size, seed).to(device)\n",
    "        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr)\n",
    "\n",
    "        self.memory = ReplayBuffer(action_size, buffer_size=self.BUFFER_SIZE,\n",
    "                                   batch_size=self.BATCH_SIZE, seed=seed)\n",
    "        self.t_step = 0\n",
    "\n",
    "    def reset(self):\n",
    "        \"\"\"Re-initialise the agent for a fresh training run.\n",
    "\n",
    "        New local weights are drawn from the current torch RNG state (torch is not\n",
    "        re-seeded, so successive runs differ), the target network is made an exact copy\n",
    "        of them, the optimizer is rebuilt and the replay buffer is emptied.\n",
    "        \"\"\"\n",
    "        def weight_reset(m):\n",
    "            if hasattr(m, 'reset_parameters'):\n",
    "                m.reset_parameters()\n",
    "\n",
    "        self.qnetwork_local.apply(weight_reset)\n",
    "        # Start the target as a copy of the local network, matching __init__; resetting it\n",
    "        # independently would leave the two networks with unrelated weights.\n",
    "        self.qnetwork_target.load_state_dict(self.qnetwork_local.state_dict())\n",
    "        # A new optimizer discards Adam's moment estimates from the previous run.\n",
    "        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), self.lr)\n",
    "        # Empty the buffer in place: constructing a new ReplayBuffer would re-seed the\n",
    "        # global random module on every reset.\n",
    "        self.memory.memory.clear()\n",
    "        self.t_step = 0\n",
    "\n",
    "    def step(self, state, action, reward, next_state, done):\n",
    "        \"\"\"Store a transition and learn from the internal buffer every UPDATE_EVERY calls.\"\"\"\n",
    "        self.memory.add(state, action, reward, next_state, done)\n",
    "        self.t_step = (self.t_step + 1) % self.UPDATE_EVERY\n",
    "        if self.t_step == 0 and len(self.memory) > self.BATCH_SIZE:\n",
    "            self.learn(self.memory.sample(), gamma=self.GAMMA)\n",
    "\n",
    "    def _q_values(self, state):\n",
    "        \"\"\"Return the local network's Q-values for a single state as a 1-D numpy array.\"\"\"\n",
    "        state_tensor = torch.from_numpy(np.asarray(state)).float().unsqueeze(0).to(device)\n",
    "        self.qnetwork_local.eval()\n",
    "        with torch.no_grad():\n",
    "            # reshape(-1) keeps the result 1-D even when there is a single action.\n",
    "            q_values = self.qnetwork_local(state_tensor).cpu().numpy().reshape(-1)\n",
    "        self.qnetwork_local.train()\n",
    "        return q_values\n",
    "\n",
    "    def _select_within_threshold(self, q_values):\n",
    "        \"\"\"Return the first action index whose Q-value is within ``self.D`` of the maximum.\"\"\"\n",
    "        max_q = np.max(q_values)\n",
    "        candidates = np.flatnonzero(max_q - q_values <= self.D)\n",
    "        if candidates.size:\n",
    "            return int(candidates[0])\n",
    "        # Reached only when the comparison is False everywhere (negative D or NaN Q-values).\n",
    "        return int(np.argmax(q_values))\n",
    "\n",
    "    def act1(self, state, eps=0.):\n",
    "        \"\"\"Choose an action with the threshold rule and no exploration.\n",
    "\n",
    "        ``eps`` is accepted for signature compatibility with ``act`` and is ignored.\n",
    "        \"\"\"\n",
    "        return self._select_within_threshold(self._q_values(state))\n",
    "\n",
    "    def act(self, state, eps=0.):\n",
    "        \"\"\"Choose an action epsilon-greedily.\n",
    "\n",
    "        With probability ``eps`` a uniformly random action is returned; otherwise the\n",
    "        threshold rule described in the class docstring is applied.\n",
    "        \"\"\"\n",
    "        q_values = self._q_values(state)\n",
    "\n",
    "        # epsilon-greedy exploration, drawn from numpy's global RNG\n",
    "        if np.random.random() < eps:\n",
    "            return np.random.randint(self.action_size)\n",
    "        return self._select_within_threshold(q_values)\n",
    "\n",
    "    @staticmethod\n",
    "    def _as_tensors(experiences):\n",
    "        \"\"\"Convert a batch to ``(states, actions, rewards, next_states, dones)`` tensors.\n",
    "\n",
    "        A 5-tuple of tensors (the output of ``ReplayBuffer.sample``) is passed through;\n",
    "        a sequence of raw transition tuples is stacked and moved to ``device``.\n",
    "        \"\"\"\n",
    "        if len(experiences) == 5 and all(torch.is_tensor(part) for part in experiences):\n",
    "            return tuple(experiences)\n",
    "\n",
    "        states, actions, rewards, next_states, dones = zip(*experiences)\n",
    "        states = torch.from_numpy(np.vstack(states)).float().to(device)\n",
    "        actions = torch.from_numpy(np.vstack(actions)).long().to(device)\n",
    "        rewards = torch.from_numpy(np.vstack(rewards)).float().to(device)\n",
    "        next_states = torch.from_numpy(np.vstack(next_states)).float().to(device)\n",
    "        dones = torch.from_numpy(np.vstack(dones).astype(np.uint8)).float().to(device)\n",
    "        return states, actions, rewards, next_states, dones\n",
    "\n",
    "    def learn(self, experiences, gamma):\n",
    "        \"\"\"Take one gradient step on the TD error, then soft-update the target network.\n",
    "\n",
    "        Args:\n",
    "            experiences: Either a sequence of raw ``(state, action, reward, next_state, done)``\n",
    "                tuples or the 5-tuple of batched tensors returned by ``ReplayBuffer.sample``.\n",
    "            gamma: Discount factor.\n",
    "        \"\"\"\n",
    "        states, actions, rewards, next_states, dones = self._as_tensors(experiences)\n",
    "\n",
    "        # Bootstrapped target: r + gamma * max_a' Q_target(s', a'), zeroed where done == 1.\n",
    "        Q_targets_next = self.qnetwork_target(next_states).detach().max(1)[0].unsqueeze(1)\n",
    "        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))\n",
    "\n",
    "        Q_expected = self.qnetwork_local(states).gather(1, actions)\n",
    "        loss = F.mse_loss(Q_expected, Q_targets)\n",
    "\n",
    "        self.optimizer.zero_grad()\n",
    "        loss.backward()\n",
    "        self.optimizer.step()\n",
    "\n",
    "        self.soft_update(self.qnetwork_local, self.qnetwork_target, tau=self.TAU)\n",
    "\n",
    "    def soft_update(self, local_model, target_model, tau):\n",
    "        \"\"\"Blend parameters in place: target <- tau * local + (1 - tau) * target.\"\"\"\n",
    "        for tp, lp in zip(target_model.parameters(), local_model.parameters()):\n",
    "            tp.data.copy_(tau * lp.data + (1.0 - tau) * tp.data)\n",
    "\n",
    "\n",
    "# ---- Environment, agent and loop-level replay buffer ----\n",
    "\n",
    "# Optimizer learning rate and capacity of the replay buffer used by the training loop\n",
    "lr = 0.0025\n",
    "buffer_size = 10000\n",
    "buffer = deque(maxlen=buffer_size)\n",
    "\n",
    "# CartPole-v1 task; its observation/action spaces fix the network input/output sizes\n",
    "env = gym.make(\"CartPole-v1\")\n",
    "input_dim, output_dim = env.observation_space.shape[0], env.action_space.n\n",
    "\n",
    "# Agent under test; D keeps its default here and is assigned per experiment further down\n",
    "new_agent = DQNAgent(input_dim, output_dim, seed=170715, lr=lr)\n",
    "\n",
    "# Seed numpy's global RNG, which the agent's epsilon-greedy draw reads from\n",
    "np.random.seed(111)\n",
    "\n",
    "def moving_average(data, window_size=10):\n",
    "    \"\"\"Smooth ``data`` with an unweighted sliding mean over ``window_size`` samples.\n",
    "\n",
    "    The convolution runs in 'valid' mode, so for ``window_size <= len(data)`` the result\n",
    "    has ``len(data) - window_size + 1`` entries.\n",
    "    \"\"\"\n",
    "    kernel = np.full(window_size, 1.0 / window_size)\n",
    "    return np.convolve(data, kernel, mode='valid')\n",
    "\n",
    "\n",
    "# ---- Experiment configuration ----\n",
    "\n",
    "# Q-value tolerances to compare. The first entry is the int 0 (not 0.0); it is\n",
    "# formatted into the output folder name further down.\n",
    "D_values = [0, 0.05, 0.1, 0.5]\n",
    "\n",
    "# Independent runs per tolerance, episodes per run, step cap per episode\n",
    "n_runs, num_episodes, max_steps_per_episode = 50, 400, 500\n",
    "\n",
    "# Exploration schedule: epsilon = max(epsilon_end, epsilon_start * epsilon_decay_rate ** episode)\n",
    "epsilon_start, epsilon_end, epsilon_decay_rate = 1.0, 0.01, 0.997\n",
    "\n",
    "# Minibatch size and discount factor used by the training loop\n",
    "batch_size, gamma = 256, 0.99\n",
    "\n",
    "# Not read by the code visible in this cell\n",
    "update_frequency = 50\n",
    "\n",
    "\n",
    "\n",
    "\n",
    "\n",
    "\n",
    "# Evaluate every `eval_every` training episodes, averaging over `n_eval_episodes` rollouts\n",
    "eval_every = 10\n",
    "n_eval_episodes = 100\n",
    "\n",
    "\n",
    "def evaluate_agent(agent, eval_env, n_episodes):\n",
    "    \"\"\"Return the mean undiscounted return of ``agent.act1`` over ``n_episodes`` rollouts.\n",
    "\n",
    "    Rollout ``k`` resets ``eval_env`` with ``seed=k`` and runs until the environment\n",
    "    reports termination or truncation.\n",
    "    \"\"\"\n",
    "    total = 0\n",
    "    for k in range(n_episodes):\n",
    "        state, _ = eval_env.reset(seed=k)\n",
    "        done = False\n",
    "        while not done:\n",
    "            action = agent.act1(state, eps=0.0)\n",
    "            state, reward, terminated, truncated, _ = eval_env.step(action)\n",
    "            done = terminated or truncated\n",
    "            total += reward\n",
    "    return total / n_episodes\n",
    "\n",
    "\n",
    "for D in D_values:\n",
    "    new_agent.D = D\n",
    "\n",
    "    # One folder per tolerance; one .npy file per run is written into it\n",
    "    save_dir = f\"training_rewards1/D_{D}\"\n",
    "    os.makedirs(save_dir, exist_ok=True)\n",
    "\n",
    "    all_rewards = []\n",
    "\n",
    "    for run in range(n_runs):\n",
    "        # Fresh agent state and an empty replay buffer for every run\n",
    "        new_agent.reset()\n",
    "        buffer = deque(maxlen=buffer_size)\n",
    "        episode_rewards = []\n",
    "\n",
    "        for episode in range(num_episodes):\n",
    "            state, _ = env.reset(seed=episode+200)\n",
    "            epsilon = max(epsilon_end, epsilon_start * (epsilon_decay_rate ** episode))\n",
    "\n",
    "            for step in range(max_steps_per_episode):\n",
    "                action = new_agent.act(state, epsilon)\n",
    "                next_state, reward, terminated, truncated, _ = env.step(action)\n",
    "\n",
    "                # Store only true termination as the done flag, so a transition cut off\n",
    "                # by the time limit still bootstraps from next_state.\n",
    "                buffer.append((state, action, reward, next_state, terminated))\n",
    "\n",
    "                if len(buffer) >= batch_size:\n",
    "                    batch = random.sample(buffer, batch_size)\n",
    "                    new_agent.learn(batch, gamma)\n",
    "\n",
    "                state = next_state\n",
    "                # Stop on truncation as well: the env must be reset before stepping again.\n",
    "                if terminated or truncated:\n",
    "                    break\n",
    "\n",
    "            if (episode + 1) % eval_every == 0:\n",
    "                episode_rewards.append(evaluate_agent(new_agent, env, n_eval_episodes))\n",
    "\n",
    "        np.save(os.path.join(save_dir, f\"run_{run+1}_rewards1.npy\"),\n",
    "                np.array(episode_rewards))\n",
    "        all_rewards.append(episode_rewards)\n",
    "\n",
    "    # Every run evaluates at the same episodes, so the runs stack into a 2-D array\n",
    "    all_rewards = np.array(all_rewards)\n",
    "    mean_rewards = all_rewards.mean(axis=0)\n",
    "    std_rewards = all_rewards.std(axis=0)\n",
    "\n",
    "    mean_s = moving_average(mean_rewards, window_size=1)\n",
    "    std_s = moving_average(std_rewards, window_size=1)\n",
    "    # x positions are the training-episode counts at which each evaluation was taken\n",
    "    # (10, 20, ...), so the 'Episode' axis label is accurate.\n",
    "    eps_s = eval_every * np.arange(1, len(mean_s) + 1)\n",
    "\n",
    "    plt.figure(figsize=(8,5))\n",
    "    plt.plot(eps_s, mean_s, label=f\"D={D}\")\n",
    "    plt.fill_between(eps_s,\n",
    "                     mean_s - std_s,\n",
    "                     mean_s + std_s,\n",
    "                     alpha=0.3)\n",
    "    plt.xlabel(\"Episode\")\n",
    "    plt.ylabel(\"Reward\")\n",
    "    plt.title(f\"Mean ± Std over {n_runs} Runs (D={D})\")\n",
    "    plt.legend()\n",
    "    plt.grid(True)\n",
    "    plt.tight_layout()\n",
    "    plt.show()\n",
    "\n",
    "\n",
    "def moving_average(data, window_size=1):\n",
    "    \"\"\"Unweighted sliding mean of ``data`` over ``window_size`` samples ('valid' mode).\n",
    "\n",
    "    With the default ``window_size=1`` the kernel is ``[1.0]``, i.e. no smoothing.\n",
    "    \"\"\"\n",
    "    weight = 1.0 / window_size\n",
    "    kernel = np.full(window_size, weight)\n",
    "    return np.convolve(data, kernel, mode='valid')\n",
    "\n",
    "\n",
    "# ---- Combined plot of the saved evaluation curves ----\n",
    "D_values = [0.0, 0.05, 0.1, 0.5]\n",
    "n_runs = 50\n",
    "data_root = \"training_rewards1\"\n",
    "window_size = 1\n",
    "\n",
    "mean_curves = {}\n",
    "std_curves = {}\n",
    "min_common_len = None\n",
    "\n",
    "# Pass 1: load every available run and find the shortest curve across ALL tolerances,\n",
    "# so every curve is truncated to the same length before plotting.\n",
    "runs_by_D = {}\n",
    "for D in D_values:\n",
    "    # The training loop formats the folder from its own D value, which may be the int 0\n",
    "    # ('D_0') while this list holds the float 0.0 ('D_0.0'); accept either spelling.\n",
    "    candidates = [os.path.join(data_root, f\"D_{D}\"),\n",
    "                  os.path.join(data_root, f\"D_{D:g}\")]\n",
    "    dir_path = next((p for p in candidates if os.path.isdir(p)), candidates[0])\n",
    "\n",
    "    all_rewards = []\n",
    "    for run in range(1, n_runs + 1):\n",
    "        file_path = os.path.join(dir_path, f\"run_{run}_rewards1.npy\")\n",
    "        if not os.path.exists(file_path):\n",
    "            continue  # missing runs are skipped\n",
    "        all_rewards.append(np.load(file_path))\n",
    "\n",
    "    if not all_rewards:\n",
    "        continue\n",
    "\n",
    "    runs_by_D[D] = all_rewards\n",
    "    min_len = min(len(r) for r in all_rewards)\n",
    "    min_common_len = min_len if min_common_len is None else min(min_common_len, min_len)\n",
    "\n",
    "if min_common_len is None:\n",
    "    raise FileNotFoundError(f\"no reward files found under {data_root!r}\")\n",
    "\n",
    "# Pass 2: truncate to the common length, then average across runs\n",
    "for D, all_rewards in runs_by_D.items():\n",
    "    all_rewards = np.array([r[:min_common_len] for r in all_rewards])\n",
    "    mean = all_rewards.mean(axis=0)\n",
    "    std = all_rewards.std(axis=0)\n",
    "\n",
    "    mean_s = moving_average(mean, window_size)\n",
    "    std_s = moving_average(std, window_size)\n",
    "    mean_curves[D] = (mean_s, std_s)\n",
    "    std_curves[D] = std_s\n",
    "\n",
    "\n",
    "plt.figure(figsize=(6, 5))\n",
    "# Saved curves hold one value per 10 training episodes, hence x = 10, 20, ...\n",
    "# All curves now share one length, so a single x vector fits every one of them.\n",
    "x_vals = np.arange(10, 10 * (min_common_len + 1), 10)[:len(mean_s)]\n",
    "\n",
    "for D in D_values:\n",
    "    if D not in mean_curves:\n",
    "        continue\n",
    "    mean_s, std_s = mean_curves[D]\n",
    "    plt.plot(x_vals, mean_s, label=fr\"$r_{{\\mathrm{{action}}}}={D}$\")\n",
    "    plt.fill_between(x_vals, mean_s - std_s, mean_s + std_s, alpha=0.3)\n",
    "\n",
    "plt.xlabel(\"Episode\")\n",
    "plt.ylabel(\"Reward\")\n",
    "plt.legend()\n",
    "plt.xlim(10,400)\n",
    "plt.grid(True)\n",
    "plt.tight_layout()\n",
    "plt.savefig(\"square_plot.png\", dpi=600)\n",
    "plt.show()\n",
    "\n"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
