{
  "nbformat": 4,
  "nbformat_minor": 0,
  "metadata": {
    "colab": {
      "provenance": []
    },
    "kernelspec": {
      "name": "python3",
      "display_name": "Python 3"
    },
    "language_info": {
      "name": "python"
    }
  },
  "cells": [
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "zUs3mScayFPO",
        "outputId": "01023f2e-7286-4b76-e022-daff34133082"
      },
      "outputs": [
        {
          "output_type": "stream",
          "name": "stderr",
          "text": [
            "/usr/local/lib/python3.11/dist-packages/gym/core.py:317: DeprecationWarning: \u001b[33mWARN: Initializing wrapper in old step API which returns one bool instead of two. It is recommended to set `new_step_api=True` to use new step API. This will be the default behaviour in future.\u001b[0m\n",
            "  deprecation(\n",
            "/usr/local/lib/python3.11/dist-packages/gym/wrappers/step_api_compatibility.py:39: DeprecationWarning: \u001b[33mWARN: Initializing environment in old step API which returns one bool instead of two. It is recommended to set `new_step_api=True` to use new step API. This will be the default behaviour in future.\u001b[0m\n",
            "  deprecation(\n"
          ]
        },
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Ep 50 | Reward: 65.0 | Cost: 2.30 | Actor Loss: 15.560\n",
            "Ep 100 | Reward: 46.0 | Cost: 0.63 | Actor Loss: 9.226\n",
            "Ep 150 | Reward: 141.0 | Cost: 123.05 | Actor Loss: 20.735\n",
            "Ep 200 | Reward: 178.0 | Cost: 93.10 | Actor Loss: 18.476\n",
            "Ep 250 | Reward: 154.0 | Cost: 51.07 | Actor Loss: 10.554\n",
            "Ep 300 | Reward: 500.0 | Cost: 243.23 | Actor Loss: 22.044\n",
            "Ep 350 | Reward: 181.0 | Cost: 98.64 | Actor Loss: 0.060\n",
            "Ep 400 | Reward: 412.0 | Cost: 399.40 | Actor Loss: 10.790\n",
            "Ep 450 | Reward: 388.0 | Cost: 310.05 | Actor Loss: 5.575\n",
            "Ep 500 | Reward: 500.0 | Cost: 235.20 | Actor Loss: 8.389\n",
            "Ep 550 | Reward: 500.0 | Cost: 355.78 | Actor Loss: 1.659\n",
            "Ep 600 | Reward: 500.0 | Cost: 307.41 | Actor Loss: 1.480\n",
            "Ep 650 | Reward: 500.0 | Cost: 209.70 | Actor Loss: 2.507\n",
            "Ep 700 | Reward: 500.0 | Cost: 380.38 | Actor Loss: 2.634\n",
            "Ep 750 | Reward: 500.0 | Cost: 192.33 | Actor Loss: 1.860\n",
            "Ep 800 | Reward: 500.0 | Cost: 283.12 | Actor Loss: -0.025\n",
            "Ep 850 | Reward: 500.0 | Cost: 101.84 | Actor Loss: 1.009\n",
            "Ep 900 | Reward: 500.0 | Cost: 91.87 | Actor Loss: 0.311\n",
            "Ep 950 | Reward: 500.0 | Cost: 358.96 | Actor Loss: 2.462\n",
            "Ep 1000 | Reward: 500.0 | Cost: 509.22 | Actor Loss: 3.695\n",
            "Training complete. Models saved.\n"
          ]
        }
      ],
      "source": [
        "import gym\n",
        "import torch\n",
        "import torch.nn as nn\n",
        "import torch.optim as optim\n",
        "from torch.distributions import Categorical\n",
        "import numpy as np\n",
        "import matplotlib.pyplot as plt\n",
        "from IPython.display import display, clear_output\n",
        "import time\n",
        "import pandas as pd\n",
        "from copy import deepcopy\n",
        "\n",
        "# Compatibility fix for numpy\n",
        "if not hasattr(np, 'bool8'):\n",
        "    np.bool8 = np.bool_\n",
        "\n",
        "# ========== Hyperparameters ==========\n",
        "gamma = 0.99\n",
        "hidden_dim = 256\n",
        "learning_rate = 1e-3\n",
        "episodes = 1000\n",
        "lambda_fixed = 20  # Lagrange multiplier\n",
        "b = 200.0          # cost threshold buffer\n",
        "perturb_eps = 0    # Uniform noise for state perturbation\n",
        "delta = 0.05\n",
        "# ========== Environment ==========\n",
        "env = gym.make(\"CartPole-v1\")\n",
        "state_dim = env.observation_space.shape[0]\n",
        "action_dim = env.action_space.n\n",
        "\n",
        "# ========== Neural Networks ==========\n",
        "\n",
        "class Actor(nn.Module):\n",
        "    def __init__(self):\n",
        "        super().__init__()\n",
        "        self.model = nn.Sequential(\n",
        "            nn.Linear(state_dim, hidden_dim),\n",
        "            nn.ReLU(),\n",
        "            nn.Linear(hidden_dim, action_dim),\n",
        "            nn.Softmax(dim=-1)\n",
        "        )\n",
        "    def forward(self, state):\n",
        "        return self.model(state)\n",
        "\n",
        "class ValueCritic(nn.Module):\n",
        "    def __init__(self):\n",
        "        super().__init__()\n",
        "        self.model = nn.Sequential(\n",
        "            nn.Linear(state_dim, hidden_dim),\n",
        "            nn.ReLU(),\n",
        "            nn.Linear(hidden_dim, 1)\n",
        "        )\n",
        "    def forward(self, state):\n",
        "        return self.model(state)\n",
        "\n",
        "# ========== Networks & Optimizers ==========\n",
        "actor = Actor()\n",
        "reward_critic = ValueCritic()\n",
        "cost_critic = ValueCritic()\n",
        "\n",
        "actor_optim = optim.Adam(actor.parameters(), lr=learning_rate)\n",
        "reward_optim = optim.Adam(reward_critic.parameters(), lr=learning_rate)\n",
        "cost_optim = optim.Adam(cost_critic.parameters(), lr=learning_rate)\n",
        "\n",
        "# ========== Utility Functions ==========\n",
        "\n",
        "def add_uniform_noise(state, eps=0.05):\n",
        "    \"\"\"Uniform perturbation across each dimension of state.\"\"\"\n",
        "    #noise = np.random.uniform(0, eps, size=state.shape)\n",
        "    return state\n",
        "\n",
        "def discount(values, gamma=0.99):\n",
        "    result = []\n",
        "    G = 0\n",
        "    for v in reversed(values):\n",
        "        G = v + gamma * G\n",
        "        result.insert(0, G)\n",
        "    return torch.FloatTensor(result)\n",
        "\n",
        "# ========== Training Loop ==========\n",
        "\n",
        "dataF = {'cost': [], 'reward': []}\n",
        "last_50_actor_params = []  # To store actor weights for averaging\n",
        "best_reward = float('-inf')\n",
        "best_actor_state_dict = None\n",
        "\n",
        "start = time.time()\n",
        "for ep in range(episodes):\n",
        "    state = env.reset()\n",
        "    state = add_uniform_noise(np.array(state), perturb_eps)\n",
        "    state = torch.FloatTensor(state)\n",
        "\n",
        "    log_probs = []\n",
        "    rewards = []\n",
        "    costs = []\n",
        "    reward_values = []\n",
        "    cost_values = []\n",
        "\n",
        "    total_reward = 0\n",
        "    total_cost = 0\n",
        "    done = False\n",
        "\n",
        "    while not done:\n",
        "        probs = actor(state)\n",
        "        dist = Categorical(probs)\n",
        "        action = dist.sample()\n",
        "\n",
        "        next_state, reward, done, _ = env.step(action.item())\n",
        "\n",
        "        next_state = add_uniform_noise(np.array(next_state), perturb_eps)\n",
        "        next_state = torch.FloatTensor(next_state)\n",
        "\n",
        "        cost = abs(state[0].item())  # distance-based cost\n",
        "\n",
        "        # Save transitions\n",
        "        log_probs.append(dist.log_prob(action))\n",
        "        rewards.append(reward)\n",
        "        costs.append(cost)\n",
        "        reward_values.append(reward_critic(state))\n",
        "        cost_values.append(cost_critic(state))\n",
        "\n",
        "        total_reward += reward\n",
        "        total_cost += cost\n",
        "        state = next_state\n",
        "\n",
        "    # Discounted returns\n",
        "    reward_returns = discount(rewards, gamma)\n",
        "    cost_returns = discount(costs, gamma)\n",
        "\n",
        "    reward_values = torch.cat(reward_values).squeeze()\n",
        "    cost_values = torch.cat(cost_values).squeeze()\n",
        "    log_probs = torch.stack(log_probs)\n",
        "\n",
        "    adv_r = reward_returns - reward_values.detach()\n",
        "    adv_c = cost_returns - cost_values.detach()\n",
        "\n",
        "    chosen_adv = []\n",
        "    for vr, vc, ar, ac in zip(reward_returns, cost_returns, adv_r, adv_c):\n",
        "        if vc.item()<=b+delta:\n",
        "            chosen_adv.append(ar)\n",
        "        else:\n",
        "            chosen_adv.append(-ac)  # penalize constraint\n",
        "    chosen_adv = torch.stack(chosen_adv)\n",
        "\n",
        "    # ===== Losses =====\n",
        "    actor_loss = -(log_probs * chosen_adv).mean()\n",
        "    reward_loss = nn.functional.mse_loss(reward_values, reward_returns)\n",
        "    cost_loss = nn.functional.mse_loss(cost_values, cost_returns)\n",
        "\n",
        "    # ===== Backprop =====\n",
        "    actor_optim.zero_grad()\n",
        "    actor_loss.backward()\n",
        "    actor_optim.step()\n",
        "\n",
        "    reward_optim.zero_grad()\n",
        "    reward_loss.backward()\n",
        "    reward_optim.step()\n",
        "\n",
        "    cost_optim.zero_grad()\n",
        "    cost_loss.backward()\n",
        "    cost_optim.step()\n",
        "\n",
        "    dataF['cost'].append(total_cost)\n",
        "    dataF['reward'].append(total_reward)\n",
        "\n",
        "    # === Store weights for last 50 actor policies ===\n",
        "    if len(last_50_actor_params) >= 50:\n",
        "        last_50_actor_params.pop(0)\n",
        "    last_50_actor_params.append(deepcopy(actor.state_dict()))\n",
        "\n",
        "    if total_cost < b and total_reward > best_reward:\n",
        "        best_reward = total_reward\n",
        "        best_actor_state_dict = deepcopy(actor.state_dict())\n",
        "\n",
        "    # Print progress\n",
        "    if (ep + 1) % 50 == 0:\n",
        "        print(f\"Ep {ep+1} | Reward: {total_reward:.1f} | Cost: {total_cost:.2f} | Actor Loss: {actor_loss.item():.3f}\")\n",
        "\n",
        "# ========== After Training ==========\n",
        "\n",
        "# Save data\n",
        "env.close()\n",
        "df = pd.DataFrame(dataF)\n",
        "df.to_excel('tvf_and_tcf_data_crpo.xlsx')\n",
        "\n",
        "# Save final models\n",
        "torch.save(actor.state_dict(), 'actor_crpo.pth')\n",
        "torch.save(reward_critic.state_dict(), 'reward_critic_crpo.pth')\n",
        "torch.save(cost_critic.state_dict(), 'cost_critic_crpo.pth')\n",
        "\n",
        "# === Average Last 50 Actor Policies ===\n",
        "avg_actor_state_dict = deepcopy(last_50_actor_params[0])\n",
        "for key in avg_actor_state_dict.keys():\n",
        "    for i in range(1, len(last_50_actor_params)):\n",
        "        avg_actor_state_dict[key] += last_50_actor_params[i][key]\n",
        "    avg_actor_state_dict[key] /= len(last_50_actor_params)\n",
        "\n",
        "avg_actor = Actor()\n",
        "avg_actor.load_state_dict(avg_actor_state_dict)\n",
        "torch.save(avg_actor.state_dict(), 'actor_avg_last50_crpo.pth')\n",
        "\n",
        "# Save best actor if available\n",
        "if best_actor_state_dict is not None:\n",
        "    best_actor = Actor()\n",
        "    best_actor.load_state_dict(best_actor_state_dict)\n",
        "    torch.save(best_actor.state_dict(), 'actor_best_crpo.pth')\n",
        "print(\"Time taken\", time.time()-start)\n",
        "print(\"Training complete. Models saved.\")"
      ]
    }
  ]
}