{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "bd0b4cc0",
   "metadata": {},
   "outputs": [],
   "source": [
    "#!/usr/bin/env python\n",
    "import numpy as np\n",
    "import random\n",
    "from tqdm import tqdm\n",
    "import pickle\n",
    "\n",
    "np.set_printoptions(precision=4, suppress=True)\n",
    "\n",
    "# ------------------------------\n",
    "# Q-Learning Code Implementation\n",
    "# ------------------------------\n",
    "class GridworldEnv:\n",
    "    \"\"\"Stochastic gridworld with two forced-transition ('special') cells.\n",
    "\n",
    "    In an ordinary cell the chosen action is executed with probability\n",
    "    intended_prob and each of its two perpendicular actions with probability\n",
    "    slip_prob. A move that would leave the grid keeps the agent in place with\n",
    "    reward -1; every other ordinary move has reward 0. In a special cell every\n",
    "    action leads to the cell's fixed destination and pays its fixed reward.\n",
    "    \"\"\"\n",
    "\n",
    "    def __init__(self, grid_size=(4, 4)):\n",
    "        self.rows, self.cols = grid_size\n",
    "        # Special cells: any action jumps to 'dest' and collects 'reward'.\n",
    "        self.special_states = {\n",
    "            (0, 1): {'dest': (3, 1), 'reward': 10},\n",
    "            (0, 3): {'dest': (2, 3), 'reward': 5},\n",
    "        }\n",
    "        # Action id -> (row step, column step): 0 left, 1 up, 2 right, 3 down.\n",
    "        self.actions = {0: (0, -1), 1: (-1, 0), 2: (0, 1), 3: (1, 0)}\n",
    "        self.action_list = [0, 1, 2, 3]\n",
    "        self.intended_prob = 0.9\n",
    "        self.slip_prob = 0.05  # probability of each of the two perpendicular moves\n",
    "\n",
    "    def in_bounds(self, state):\n",
    "        \"\"\"Tell whether state = (row, col) lies inside the grid.\"\"\"\n",
    "        row, col = state\n",
    "        return (0 <= row < self.rows) and (0 <= col < self.cols)\n",
    "\n",
    "    def get_perpendicular_actions(self, action):\n",
    "        \"\"\"Return the two action ids orthogonal to action ([] for an unknown id).\"\"\"\n",
    "        if action in (0, 2):  # horizontal move -> up / down\n",
    "            return [1, 3]\n",
    "        if action in (1, 3):  # vertical move -> left / right\n",
    "            return [0, 2]\n",
    "        return []\n",
    "\n",
    "    def get_outcomes(self, state, action):\n",
    "        \"\"\"\n",
    "        Enumerate the possible results of taking action in state.\n",
    "\n",
    "        Returns a list of (probability, next_state, reward) tuples: a single\n",
    "        forced outcome for a special cell, otherwise the intended move followed\n",
    "        by the two perpendicular slips.\n",
    "        \"\"\"\n",
    "        special = self.special_states.get(state)\n",
    "        if special is not None:\n",
    "            return [(1.0, special['dest'], special['reward'])]\n",
    "\n",
    "        candidates = [action] + self.get_perpendicular_actions(action)\n",
    "        weights = [self.intended_prob, self.slip_prob, self.slip_prob]\n",
    "        results = []\n",
    "        for move, prob in zip(candidates, weights):\n",
    "            d_row, d_col = self.actions[move]\n",
    "            target = (state[0] + d_row, state[1] + d_col)\n",
    "            if self.in_bounds(target):\n",
    "                results.append((prob, target, 0))\n",
    "            else:\n",
    "                # Hitting the boundary: stay where we are and pay the penalty.\n",
    "                results.append((prob, state, -1))\n",
    "        return results\n",
    "\n",
    "def compute_optimal_Q(env, gamma=0.9, tol=1e-9, max_iter=10000):\n",
    "    \"\"\"\n",
    "    Compute the optimal Q-function Q* by synchronous value iteration.\n",
    "\n",
    "    Each sweep applies the Bellman optimality operator to every (state, action)\n",
    "    pair. The transition model is taken from env.get_outcomes, so the dynamics\n",
    "    (including special states) are defined in one place only.\n",
    "\n",
    "    Parameters:\n",
    "      env: environment exposing rows, cols, action_list and\n",
    "           get_outcomes(state, action) -> [(prob, next_state, reward), ...].\n",
    "           Action ids are used directly as indices of the last Q axis.\n",
    "      gamma: discount factor.\n",
    "      tol: stop once the sup-norm change between two sweeps is below tol.\n",
    "      max_iter: maximum number of sweeps.\n",
    "\n",
    "    Returns:\n",
    "      Q* as a (rows x cols x num_actions) numpy array. If tol is not reached\n",
    "      within max_iter sweeps, a RuntimeWarning is issued and the last iterate\n",
    "      is returned.\n",
    "    \"\"\"\n",
    "    import warnings\n",
    "\n",
    "    Q = np.zeros((env.rows, env.cols, len(env.action_list)))\n",
    "    err = np.inf\n",
    "    for it in range(max_iter):\n",
    "        V = Q.max(axis=2)  # greedy state values of the current iterate\n",
    "        Q_new = np.zeros_like(Q)\n",
    "        for r in range(env.rows):\n",
    "            for c in range(env.cols):\n",
    "                for a in env.action_list:\n",
    "                    value = 0.0\n",
    "                    for p, (nr, nc), reward in env.get_outcomes((r, c), a):\n",
    "                        value += p * (reward + gamma * V[nr, nc])\n",
    "                    Q_new[r, c, a] = value\n",
    "        err = np.max(np.abs(Q_new - Q))\n",
    "        Q = Q_new\n",
    "        if err < tol:\n",
    "            print(f\"Value iteration converged in {it+1} iterations with error {err:.2e}.\")\n",
    "            break\n",
    "    else:\n",
    "        # The loop ran out of sweeps without hitting the tolerance.\n",
    "        warnings.warn(\n",
    "            f\"Value iteration did not reach tol={tol:g} within {max_iter} iterations \"\n",
    "            f\"(last error {err:.2e}).\",\n",
    "            RuntimeWarning,\n",
    "            stacklevel=2,\n",
    "        )\n",
    "    return Q\n",
    "\n",
    "class QLearningAgent:\n",
    "    \"\"\"Container for the tabular Q-estimate and the hyper-parameters of a run.\"\"\"\n",
    "\n",
    "    def __init__(self, env, alpha_constant=0.1,alpha_diminishing=0.5, gamma=0.9):\n",
    "        # Q-table with one entry per (row, col, action), initialised to zero.\n",
    "        table_shape = (env.rows, env.cols, len(env.action_list))\n",
    "        self.env = env\n",
    "        self.gamma = gamma  # discount factor\n",
    "        self.alpha_constant = alpha_constant  # learning rate\n",
    "        self.alpha_diminishing = alpha_diminishing\n",
    "        self.Q = np.zeros(table_shape)\n",
    "\n",
    "def run_q_learning_LD2Z(Q_star, steps=2, gamma=0.7, eta=0.65, power=1):\n",
    "    \"\"\"\n",
    "    Run synchronous Q-learning with a step size that decays linearly to zero.\n",
    "\n",
    "    At step t every (state, action) entry is updated with learning rate\n",
    "    lr_t = eta * (1 - t/steps)**power from one sampled transition. A single\n",
    "    branch index (intended move or one of the two slips) is drawn per step and\n",
    "    shared by all non-special state-action pairs; special states use their\n",
    "    forced transition.\n",
    "\n",
    "    Parameters:\n",
    "      Q_star: optimal Q-function, array of shape (rows, cols, num_actions).\n",
    "      steps: number of synchronous updates n.\n",
    "      gamma: discount factor.\n",
    "      eta: initial learning rate.\n",
    "      power: exponent of the decay schedule.\n",
    "\n",
    "    Returns:\n",
    "      agent: QLearningAgent holding the final Q-table (its alpha attributes are\n",
    "             the class defaults; they play no role in this schedule).\n",
    "      env: the GridworldEnv the run was performed on.\n",
    "      max_norm: max over k_n <= t < n of the sup-norm of\n",
    "                sum_{l=t}^{n-1} (Q_l - Q_star), with k_n = n - int(sqrt(n)).\n",
    "      error_history: list of sup-norm errors |Q_t - Q_star|, one per step.\n",
    "      Q_traj: copy of the final Q-table.\n",
    "    \"\"\"\n",
    "    env = GridworldEnv()\n",
    "    # Do not read alpha_constant / alpha_diminishing from notebook globals:\n",
    "    # they are not parameters of this function and are unused by the update.\n",
    "    agent = QLearningAgent(env, gamma=gamma)\n",
    "    Q_traj = agent.Q.copy()\n",
    "    branch_probs = [env.intended_prob, env.slip_prob, env.slip_prob]\n",
    "    error_history = []  # sup-norm error after each step\n",
    "\n",
    "    # Only the last int(sqrt(n)) differences enter the tail statistic.\n",
    "    n = steps\n",
    "    k_n = n - int(np.sqrt(n))\n",
    "    tail_differences = []\n",
    "\n",
    "    for t in tqdm(range(steps), desc=\"Linearly decaying steps\", ncols=100, position=0):\n",
    "        branch = np.random.choice([0, 1, 2], p=branch_probs)\n",
    "        lr = eta * ((1 - t / steps) ** power)  # same for every entry at step t\n",
    "        for r in range(env.rows):\n",
    "            for c in range(env.cols):\n",
    "                s = (r, c)\n",
    "                for a in env.action_list:\n",
    "                    if s in env.special_states:\n",
    "                        forced_dest = env.special_states[s]['dest']\n",
    "                        reward = env.special_states[s]['reward']\n",
    "                        bellman_op = reward + gamma * np.max(Q_traj[forced_dest[0], forced_dest[1]])\n",
    "                    else:\n",
    "                        _, s_next, r_val = env.get_outcomes(s, a)[branch]\n",
    "                        bellman_op = r_val + gamma * np.max(Q_traj[s_next[0], s_next[1]])\n",
    "                    agent.Q[r, c, a] = (1 - lr) * agent.Q[r, c, a] + lr * bellman_op\n",
    "        # Targets of the next step are built from a frozen copy (synchronous update).\n",
    "        Q_traj = agent.Q.copy()\n",
    "        difference = Q_traj - Q_star\n",
    "        error_history.append(np.max(np.abs(difference)))\n",
    "        if t >= k_n:\n",
    "            tail_differences.append(difference)\n",
    "\n",
    "    # max_{k_n <= t < n} |sum_{l=t}^{n-1} (Q_l - Q_star)| via a running suffix sum.\n",
    "    max_norm = 0.0\n",
    "    partial_sum = np.zeros_like(Q_star)\n",
    "    for difference in reversed(tail_differences):\n",
    "        partial_sum = partial_sum + difference\n",
    "        current_norm = np.max(np.abs(partial_sum))\n",
    "        if current_norm > max_norm:\n",
    "            max_norm = current_norm\n",
    "\n",
    "    return agent, env, max_norm, error_history, Q_traj"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "bc8aaf2f",
   "metadata": {},
   "outputs": [],
   "source": [
    "from joblib import Parallel, delayed\n",
    "from tqdm import tqdm\n",
    "from tqdm_joblib import tqdm_joblib\n",
    "\n",
    "def run_single_q_learning(seed, Q_star, steps, gamma, eta, power):\n",
    "    \"\"\"\n",
    "    Run one seeded Q-learning replication.\n",
    "\n",
    "    Both NumPy's global generator and the stdlib random module are seeded with\n",
    "    seed before calling run_q_learning_LD2Z. Returns the last three items of\n",
    "    its result: (max_norm, error_history, Q_traj).\n",
    "    \"\"\"\n",
    "    for seeder in (np.random.seed, random.seed):\n",
    "        seeder(seed)\n",
    "    run_kwargs = dict(Q_star=Q_star, steps=steps, gamma=gamma, eta=eta, power=power)\n",
    "    _agent, _env, max_norm, error_history, Q_traj = run_q_learning_LD2Z(**run_kwargs)\n",
    "    return max_norm, error_history, Q_traj\n",
    "\n",
    "def run_experiments_parallel(B, Q_star, steps, gamma, eta, power,\n",
    "                              n_jobs=5):\n",
    "    \"\"\"\n",
    "    Run B Q-learning replications, seeded 0..B-1, on n_jobs joblib workers.\n",
    "\n",
    "    Returns the list produced by joblib, one (max_norm, error_history, Q_traj)\n",
    "    tuple per seed. A tqdm bar tracks completed replications.\n",
    "    \"\"\"\n",
    "    tasks = (\n",
    "        delayed(run_single_q_learning)(seed, Q_star, steps, gamma, eta, power)\n",
    "        for seed in range(B)\n",
    "    )\n",
    "    progress_bar = tqdm(total=B, desc=\"Running experiments\", ncols=100)\n",
    "    with tqdm_joblib(progress_bar):\n",
    "        results = Parallel(n_jobs=n_jobs)(tasks)\n",
    "    return results\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b272593a",
   "metadata": {},
   "outputs": [],
   "source": [
    "if __name__ == \"__main__\":\n",
    "    # Experiment parameters: B seeded replications of `steps` Q-learning updates\n",
    "    # with discount `gamma` and step size eta * (1 - t/steps)**power.\n",
    "    B = 500\n",
    "    steps= 5000\n",
    "    gamma = 0.1\n",
    "    eta = 0.05\n",
    "    power = 1\n",
    "    # Only passed to QLearningAgent below.\n",
    "    alpha_constant = 0.05\n",
    "    alpha_diminishing = 0.05\n",
    "    ###### Compute Q^star\n",
    "\n",
    "    env = GridworldEnv()\n",
    "    # NOTE(review): `agent` is not used in the rest of this cell.\n",
    "    agent = QLearningAgent(env, alpha_constant=alpha_constant, alpha_diminishing=alpha_diminishing, gamma=gamma)\n",
    "    Q_star = compute_optimal_Q(env, gamma=gamma)\n",
    "\n",
    "    # One (max_norm, error_history, Q_traj) tuple per replication.\n",
    "    results = run_experiments_parallel(B, Q_star, steps, gamma, eta, power, n_jobs=5)\n",
    "\n",
    "    # Unzip the results into separate lists\n",
    "    max_norm_list, all_error_history, final_Q_list = zip(*results)  # This gives tuples\n",
    "    max_norm_list = list(max_norm_list)\n",
    "    all_error_history = list(all_error_history)\n",
    "    final_Q_list = list(final_Q_list)\n",
    "\n",
    "    # Save the tail max-norm statistic of every replication\n",
    "    with open(\"max_norm_list.pkl\", \"wb\") as f:\n",
    "        pickle.dump(max_norm_list, f)\n",
    "    print(f\"Saved max_norm_list.pkl with {len(max_norm_list)} experiments.\")\n",
    "    \n",
    "    # Save the final Q-table of every replication\n",
    "    with open(\"final_Q_list.pkl\", \"wb\") as f:\n",
    "        pickle.dump(final_Q_list, f)\n",
    "    print(f\"Saved final_Q_list.pkl with {len(final_Q_list)} experiments.\")\n",
    "\n",
    "\n",
    "    # Save error history\n",
    "    with open(\"all_error_history.pkl\", \"wb\") as f:\n",
    "        pickle.dump(all_error_history, f)\n",
    "    print(f\"Saved all_error_history.pkl with {len(all_error_history)} experiments.\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "41016779",
   "metadata": {},
   "outputs": [],
   "source": [
    "import matplotlib.pyplot as plt\n",
    "import numpy as np\n",
    "import random\n",
    "import pickle\n",
    "\n",
    "# Load saved data (pickles written by the experiment cell; pickle.load must\n",
    "# only be used on files you trust).\n",
    "with open(\"all_error_history.pkl\", \"rb\") as f:\n",
    "    all_error_history = pickle.load(f)\n",
    "\n",
    "with open(\"final_Q_list.pkl\", \"rb\") as f:\n",
    "    final_Q_list = pickle.load(f)\n",
    "\n",
    "# Rebuild the environment and recompute Q*.\n",
    "# NOTE(review): `gamma` is not defined in this cell; it must already exist in\n",
    "# the kernel namespace.\n",
    "env = GridworldEnv()\n",
    "Q_star = compute_optimal_Q(env, gamma=gamma)\n",
    "\n",
    "# Randomly select one run (unseeded, so the chosen run differs between executions)\n",
    "idx = random.randint(0, len(all_error_history) - 1)\n",
    "error_history = all_error_history[idx]\n",
    "final_Q = final_Q_list[idx]\n",
    "\n",
    "# Plot the error curve: one point per entry of the selected error history\n",
    "plt.figure(figsize=(8, 5))\n",
    "plt.plot(range(len(error_history)), error_history, label='Max-Norm Error |Q - Q*|')\n",
    "plt.xlabel(\"Episode\")\n",
    "plt.ylabel(\"Error\")\n",
    "plt.title(f\"Error between Q-learning iterate and Q* (Run #{idx})\")\n",
    "plt.legend()\n",
    "plt.grid(True)\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a016bc6f",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Q-value pretty-printer\n",
    "def print_q_values(Q, title=\"Q-values\"):\n",
    "    \"\"\"Print title, then the action values of every state; a blank line follows each grid row.\"\"\"\n",
    "    print(title)\n",
    "    for i, grid_row in enumerate(Q):\n",
    "        for j, q_vals in enumerate(grid_row):\n",
    "            print(f\"State ({i},{j}): {q_vals}\")\n",
    "        print()\n",
    "\n",
    "# Print learned and optimal Q-values\n",
    "print_q_values(final_Q, \"Learned Q-values (final)\")\n",
    "print_q_values(Q_star, \"Optimal Q* (from value iteration)\")\n",
    "\n",
    "# Greedy-policy printer\n",
    "def print_policy(Q, env):\n",
    "    \"\"\"Print the greedy policy of Q as a grid of arrows; 'F' marks cells listed in env.special_states.\"\"\"\n",
    "    arrows = {0: \"←\", 1: \"↑\", 2: \"→\", 3: \"↓\"}\n",
    "    grid = np.full((env.rows, env.cols), \" \")\n",
    "    for cell in np.ndindex(env.rows, env.cols):\n",
    "        if cell in env.special_states:\n",
    "            grid[cell] = \"F\"  # Forced transition\n",
    "            continue\n",
    "        grid[cell] = arrows[int(np.argmax(Q[cell]))]\n",
    "    print(\"Derived Greedy Policy:\")\n",
    "    print(grid)\n",
    "\n",
    "print_policy(final_Q, env)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f93d54b5",
   "metadata": {},
   "outputs": [],
   "source": [
    "###### Compute Q^star\n",
    "# Rebuilds the environment, an agent and Q* at notebook level. Requires\n",
    "# alpha_constant, alpha_diminishing and gamma to be defined already.\n",
    "\n",
    "env = GridworldEnv()\n",
    "agent = QLearningAgent(env, alpha_constant=alpha_constant, alpha_diminishing=alpha_diminishing, gamma=gamma)\n",
    "Q_star = compute_optimal_Q(env, gamma=gamma)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "300e17b7",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Reload the per-run tail max-norms. Only unpickle files you produced\n",
    "# yourself: pickle.load can execute arbitrary code.\n",
    "with open(\"max_norm_list.pkl\", \"rb\") as f:\n",
    "    max_norm_list = pickle.load(f)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "3b4a2022",
   "metadata": {},
   "source": [
    "# Bootstrap: simulating the Gaussian approximation of the Q-learning error"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "df0c2735",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "199b70df",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a6d9f17e",
   "metadata": {},
   "outputs": [],
   "source": [
    "def state_action_index(state, action, env):\n",
    "    \"\"\"\n",
    "    Flatten a (state, action) pair into one integer index.\n",
    "\n",
    "    state is a (row, col) tuple. States are numbered row by row\n",
    "    (row * env.cols + col) and each state owns len(env.action_list)\n",
    "    consecutive slots, addressed by the integer action.\n",
    "    \"\"\"\n",
    "    row, col = state\n",
    "    n_actions = len(env.action_list)\n",
    "    flat_state = row * env.cols + col\n",
    "    return flat_state * n_actions + action\n",
    "\n",
    "\n",
    "def simulate_Bhat_samples(env, Q_star, gamma=0.9, T=10):\n",
    "    \"\"\"\n",
    "    Draw T independent samples of the empirical Bellman operator B_hat(Q_star)\n",
    "    for every state-action pair, taking the special states into account.\n",
    "\n",
    "    One branch index (0: intended move, 1 and 2: the two slips) is drawn per\n",
    "    sample with probabilities [env.intended_prob, env.slip_prob, env.slip_prob]\n",
    "    and shared by all non-special state-action pairs. For each pair (s, a):\n",
    "      - If s is special, the outcome is forced:\n",
    "            sample = r(s,a) + gamma * max_{a'} Q_star(forced_dest, a')\n",
    "        where forced_dest and r(s,a) are given by env.special_states.\n",
    "      - Otherwise the drawn branch of env.get_outcomes(s, a) is used:\n",
    "            sample = r + gamma * max_{a'} Q_star(s_next, a')\n",
    "\n",
    "    Parameters:\n",
    "      env: the environment to sample from (the argument is used as given).\n",
    "      Q_star: array of shape (rows, cols, num_actions).\n",
    "      gamma: discount factor.\n",
    "      T: number of samples.\n",
    "\n",
    "    Returns:\n",
    "      samples: a (T x D) numpy array, where D = |S|*|A| and columns are\n",
    "               ordered by state_action_index.\n",
    "    \"\"\"\n",
    "    D = env.rows * env.cols * len(env.action_list)\n",
    "    samples = np.zeros((T, D))\n",
    "    branch_probs = [env.intended_prob, env.slip_prob, env.slip_prob]\n",
    "\n",
    "    for t in tqdm(range(T), desc=\"sampling bellman noise\", ncols=100, position=0):\n",
    "        branch = np.random.choice([0, 1, 2], p=branch_probs)\n",
    "        for r in range(env.rows):\n",
    "            for c in range(env.cols):\n",
    "                s = (r, c)\n",
    "                for a in env.action_list:\n",
    "                    idx = state_action_index(s, a, env)\n",
    "                    if s in env.special_states:\n",
    "                        # Special state: use forced destination and reward.\n",
    "                        forced_dest = env.special_states[s]['dest']\n",
    "                        reward = env.special_states[s]['reward']\n",
    "                        samples[t, idx] = reward + gamma * np.max(Q_star[forced_dest[0], forced_dest[1]])\n",
    "                    else:\n",
    "                        # Non-special state: follow the branch drawn for this sample.\n",
    "                        _, s_next, r_val = env.get_outcomes(s, a)[branch]\n",
    "                        samples[t, idx] = r_val + gamma * np.max(Q_star[s_next[0], s_next[1]])\n",
    "    return samples\n",
    "\n",
    "# Estimate the covariance of the Bellman noise at Q* from T simulated samples.\n",
    "B_vec = simulate_Bhat_samples(env, Q_star, gamma=gamma, T=10000)\n",
    "Z_vec = B_vec - np.mean(B_vec, axis=0)  # centred samples\n",
    "Gamma_est = np.cov(Z_vec, rowvar=False)\n",
    "# Gamma_est is symmetric, so use the symmetric solver: it returns real\n",
    "# eigenvalues in ascending order.\n",
    "np.linalg.eigvalsh(Gamma_est)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "01d3a457",
   "metadata": {},
   "outputs": [],
   "source": [
    "def compute_P(env):\n",
    "    \"\"\"\n",
    "    Build the transition matrix P of shape (D, |S|), where D = |S|*|A|.\n",
    "\n",
    "    Row s*|A| + a accumulates, for every outcome returned by\n",
    "    env.get_outcomes(state, a), its probability in the column of the next\n",
    "    state. States are flattened row by row as row * env.cols + col.\n",
    "    \"\"\"\n",
    "    n_states = env.rows * env.cols\n",
    "    n_actions = len(env.action_list)\n",
    "    P = np.zeros((n_states * n_actions, n_states))\n",
    "    for s in range(n_states):\n",
    "        state = divmod(s, env.cols)  # (row, col) of flat state s\n",
    "        for a in env.action_list:\n",
    "            for prob, next_state, _ in env.get_outcomes(state, a):\n",
    "                column = next_state[0] * env.cols + next_state[1]\n",
    "                P[s * n_actions + a, column] += prob\n",
    "    return P\n",
    "\n",
    "def compute_Pi(pi):\n",
    "    \"\"\"\n",
    "    Embed a policy matrix pi of shape (|S|, |A|) into the block matrix Pi of\n",
    "    shape (|S|, D), where D = |S|*|A|.\n",
    "\n",
    "    Row s of Pi is zero except for columns s*|A| .. (s+1)*|A| - 1, which hold\n",
    "    pi[s]; for a deterministic policy each row is therefore one-hot.\n",
    "    \"\"\"\n",
    "    n_states, n_actions = pi.shape\n",
    "    Pi = np.zeros((n_states, n_states * n_actions))\n",
    "    for s, action_probs in enumerate(pi):\n",
    "        block = slice(s * n_actions, (s + 1) * n_actions)\n",
    "        Pi[s, block] = action_probs\n",
    "    return Pi\n",
    "\n",
    "\n",
    "def compute_H_and_Sigma(Q_star, env, Gamma_star, gamma=None):\n",
    "    \"\"\"\n",
    "    Compute H = P Pi for the greedy policy of Q_star, and the matrix\n",
    "    Sigma = A^{-1} Gamma_star A^{-T} with A = I - gamma * H.\n",
    "\n",
    "    Parameters:\n",
    "      Q_star: Q-function, reshaped internally to (rows, cols, num_actions).\n",
    "      env: environment passed to compute_P; also provides rows, cols, action_list.\n",
    "      Gamma_star: (D x D) matrix, D = |S|*|A|.\n",
    "      gamma: discount factor. When omitted, the notebook-level variable\n",
    "             `gamma` is used, as in earlier versions of this function.\n",
    "\n",
    "    Returns:\n",
    "      H: (D x D) array.\n",
    "      Sigma: (D x D) array.\n",
    "\n",
    "    Raises:\n",
    "      NameError: if gamma is omitted and no notebook-level `gamma` exists.\n",
    "    \"\"\"\n",
    "    if gamma is None:\n",
    "        # Backward compatibility with calls that relied on the global `gamma`.\n",
    "        try:\n",
    "            gamma = globals()[\"gamma\"]\n",
    "        except KeyError:\n",
    "            raise NameError(\"compute_H_and_Sigma needs a discount factor: pass gamma=...\") from None\n",
    "\n",
    "    n_states = env.rows * env.cols\n",
    "    n_actions = len(env.action_list)\n",
    "    D = n_states * n_actions\n",
    "\n",
    "    # Transition matrix (assumed known), shape (D, S).\n",
    "    P = compute_P(env)\n",
    "\n",
    "    # Greedy policy as a one-hot matrix of shape (S, A); states are flattened\n",
    "    # row by row, matching compute_P.\n",
    "    Q_flat = np.reshape(Q_star, (n_states, n_actions))\n",
    "    pi = np.zeros((n_states, n_actions))\n",
    "    pi[np.arange(n_states), np.argmax(Q_flat, axis=1)] = 1.0\n",
    "\n",
    "    Pi = compute_Pi(pi)  # shape (S, D)\n",
    "    H = P @ Pi           # shape (D, D)\n",
    "\n",
    "    system = np.eye(D) - gamma * H\n",
    "    # Step 1: M = A^{-1} Gamma^T.  Step 2: Sigma = A^{-1} M^T = A^{-1} Gamma A^{-T}.\n",
    "    M = np.linalg.solve(system, Gamma_star.T)\n",
    "    Sigma = np.linalg.solve(system, M.T)\n",
    "    return H, Sigma\n",
    "\n",
    "\n",
    "H_star, Sigma_star = compute_H_and_Sigma(Q_star, env, Gamma_est)\n",
    "\n",
    "#Sigma_cholesky = np.linalg.cholesky(Sigma_star) \n",
    "#Gamma_cholesky = np.linalg.cholesky(Gamma_est)\n",
    "\n",
    "def simulate_Y(H, Q_star , env, Gamma_star, steps, gamma, eta, power):\n",
    "    \"\"\"\n",
    "    Simulate the linear Gaussian recursion\n",
    "         Y_t = (I - eta_t (I - gamma * H)) Y_{t-1} + eta_t * Z_t,   Z_t ~ N(0, Gamma_star),\n",
    "    started from Y = 0, with eta_t = eta * (1 - t/steps)**power for t = 0..steps-1.\n",
    "\n",
    "    Inputs:\n",
    "      - H: (D x D) matrix used in the drift term.\n",
    "      - Q_star: not used by the simulation; kept so existing calls keep working.\n",
    "      - env: only used to obtain D = rows * cols * len(action_list).\n",
    "      - Gamma_star: the (D x D) noise covariance matrix.\n",
    "      - steps: number of updates n.\n",
    "      - gamma: discount factor.\n",
    "      - eta, power: parameters of the step-size schedule.\n",
    "\n",
    "    Returns:\n",
    "      - Y: the final Y vector (D-dimensional),\n",
    "      - Y_history: list with the sup-norm of Y after each update,\n",
    "      - max_norm: max over k_n <= t < n of the sup-norm of sum_{l=t}^{n-1} Y_l,\n",
    "                  with k_n = n - int(sqrt(n)).\n",
    "    \"\"\"\n",
    "    D = env.rows * env.cols * len(env.action_list)\n",
    "    I = np.eye(D)\n",
    "    drift = I - gamma * H  # does not depend on t\n",
    "    Y = np.zeros(D)        # Initialize Y_0 = 0.\n",
    "    Y_history = []\n",
    "\n",
    "    # Only the last int(sqrt(n)) iterates enter the tail statistic.\n",
    "    n = steps\n",
    "    k_n = n - int(np.sqrt(n))\n",
    "    tail_iterates = []\n",
    "\n",
    "    # Draw every Z_t ~ N(0, Gamma_star) in one call so that the covariance is\n",
    "    # factorised once rather than at every step.\n",
    "    Z = np.random.multivariate_normal(np.zeros(D), Gamma_star, size=steps)\n",
    "\n",
    "    for t in tqdm(range(steps), desc=\"bootstrap: steps\", ncols=100, position=0):\n",
    "        lr = eta * (1 - t / steps) ** power\n",
    "        Y = (I - lr * drift) @ Y + lr * Z[t]\n",
    "        Y_history.append(np.max(np.abs(Y)))\n",
    "        if t >= k_n:\n",
    "            tail_iterates.append(Y)  # Y is rebound each step, so no copy is needed\n",
    "\n",
    "    # max_{k_n <= t < n} |sum_{l=t}^{n-1} Y_l| via a running suffix sum.\n",
    "    max_norm = 0.0\n",
    "    partial_sum = np.zeros(D)\n",
    "    for iterate in reversed(tail_iterates):\n",
    "        partial_sum = partial_sum + iterate\n",
    "        current_norm = np.max(np.abs(partial_sum))\n",
    "        if current_norm > max_norm:\n",
    "            max_norm = current_norm\n",
    "\n",
    "    return Y, Y_history, max_norm\n",
    "\n",
    "# Example usage (H_star comes from compute_H_and_Sigma, Gamma_est is the\n",
    "# estimated noise covariance, env is a GridworldEnv instance):\n",
    "# Y_final, Y_history, max_norm = simulate_Y(H_star, Q_star, env, Gamma_est,\n",
    "#                                           steps=steps, gamma=gamma, eta=eta, power=power)\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "549bd6d0",
   "metadata": {},
   "outputs": [],
   "source": [
    "import random\n",
    "from joblib import Parallel, delayed\n",
    "from tqdm import tqdm\n",
    "from tqdm_joblib import tqdm_joblib\n",
    "\n",
    "def simulate_Y_from_single_Q(H, Q_star, env, Gamma_star, steps, gamma, eta, power, seed=None):\n",
    "    \"\"\"\n",
    "    Run one simulate_Y replication; all randomness comes from the Gaussian noise.\n",
    "\n",
    "    seed: when not None, NumPy's global generator is seeded with it first so\n",
    "          that the replication is reproducible.\n",
    "\n",
    "    Returns the simulate_Y result: (final Y vector, Y_history, max_norm).\n",
    "    \"\"\"\n",
    "    if seed is not None:\n",
    "        np.random.seed(seed)\n",
    "    return simulate_Y(H, Q_star, env, Gamma_star,\n",
    "                      steps, gamma, eta, power)\n",
    "\n",
    "def run_Y_simulations_on_one_Q(H, Q_star, env, Gamma_star,\n",
    "                               steps, gamma, eta, power, B=100,\n",
    "                               n_jobs=4, base_seed=1_000_000):\n",
    "    \"\"\"\n",
    "    Run simulate_Y B times on n_jobs joblib workers.\n",
    "\n",
    "    Replication i is seeded with base_seed + i, so results do not depend on\n",
    "    which worker runs which task. The default offset keeps these seeds apart\n",
    "    from small seeds such as 0..B-1. Pass base_seed=None to leave the\n",
    "    generator unseeded (the previous behaviour).\n",
    "\n",
    "    Returns a list with one (final Y vector, Y_history, max_norm) tuple per\n",
    "    replication.\n",
    "    \"\"\"\n",
    "    if base_seed is None:\n",
    "        seeds = [None] * B\n",
    "    else:\n",
    "        seeds = [base_seed + i for i in range(B)]\n",
    "\n",
    "    with tqdm_joblib(tqdm(total=B, desc=\"Simulating Y on last Q_traj\", ncols=100)):\n",
    "        results = Parallel(n_jobs=n_jobs)(\n",
    "            delayed(simulate_Y_from_single_Q)(\n",
    "                H,\n",
    "                Q_star, env, Gamma_star,\n",
    "                steps, gamma, eta, power,\n",
    "                seed=seed\n",
    "            )\n",
    "            for seed in seeds\n",
    "        )\n",
    "    return results  # List of (final Y, Y_history, max_norm)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ea6045ac",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Run B bootstrap replications of the Gaussian recursion in parallel.\n",
    "Y_results = run_Y_simulations_on_one_Q(\n",
    "    H_star,\n",
    "    Q_star,\n",
    "    env,\n",
    "    Gamma_star=Gamma_est,\n",
    "    steps=steps, gamma=gamma, eta=eta, power=power,B=B,\n",
    "    n_jobs=5\n",
    ")\n",
    "\n",
    "# Unpack for plotting or analysis. Each result is (final Y vector, per-step\n",
    "# sup-norm history, tail max-norm), so Y_trajs holds final vectors only.\n",
    "Y_trajs, Y_histories, max_norms_Y = zip(*Y_results)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "69c84ddc",
   "metadata": {},
   "outputs": [],
   "source": [
    "import matplotlib.pyplot as plt\n",
    "import numpy as np\n",
    "import random\n",
    "import pickle\n",
    "\n",
    "# Save the bootstrap sup-norm histories, then read them back from disk.\n",
    "with open(\"Y_histories.pkl\", \"wb\") as f:\n",
    "    pickle.dump(Y_histories, f)\n",
    "\n",
    "with open(\"Y_histories.pkl\", \"rb\") as f:\n",
    "    Y_histories = pickle.load(f)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "20767254",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Save the bootstrap tail max-norms, then read them back from disk.\n",
    "with open(\"max_norms_Y.pkl\", \"wb\") as f:\n",
    "    pickle.dump(max_norms_Y, f)\n",
    "\n",
    "with open(\"max_norms_Y.pkl\", \"rb\") as f:\n",
    "    max_norms_Y = pickle.load(f)\n",
    "    \n",
    "#max_norms_Y"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "06d4d303",
   "metadata": {},
   "outputs": [],
   "source": [
    "# -------------\n",
    "# NeurIPS-style Q–Q plot (sup-norms): Q vs Y\n",
    "# -------------\n",
    "\n",
    "import matplotlib as mpl\n",
    "import matplotlib.pyplot as plt\n",
    "import numpy as np\n",
    "\n",
    "# ---- style to match your second snippet ----\n",
    "mpl.rcParams.update({\n",
    "    \"figure.dpi\": 150,\n",
    "    \"savefig.dpi\": 300,\n",
    "    \"font.family\": \"serif\",\n",
    "    \"font.serif\": [\"STIX Two Text\", \"Times New Roman\", \"DejaVu Serif\", \"STIXGeneral\", \"CMU Serif\"],\n",
    "    \"mathtext.fontset\": \"stix\",\n",
    "    \"axes.spines.left\": True,\n",
    "    \"axes.spines.bottom\": True,\n",
    "    \"axes.spines.right\": False,\n",
    "    \"axes.spines.top\": False,\n",
    "    \"axes.linewidth\": 1.1,\n",
    "    \"axes.titlesize\": 13,\n",
    "    \"axes.labelsize\": 12,\n",
    "    \"xtick.labelsize\": 11,\n",
    "    \"ytick.labelsize\": 11,\n",
    "    \"legend.frameon\": False,\n",
    "    \"legend.fontsize\": 10,\n",
    "    \"axes.grid\": True,\n",
    "    \"grid.alpha\": 0.25,\n",
    "    \"grid.linestyle\": \"--\",\n",
    "    \"grid.linewidth\": 0.6,\n",
    "    \"xtick.major.size\": 4,\n",
    "    \"ytick.major.size\": 4,\n",
    "    \"xtick.minor.size\": 2.5,\n",
    "    \"ytick.minor.size\": 2.5,\n",
    "    \"xtick.direction\": \"in\",\n",
    "    \"ytick.direction\": \"in\",\n",
    "    \"figure.autolayout\": False,  # we'll call tight_layout explicitly\n",
    "})\n",
    "\n",
    "def qq_match(a, b, qmin=0.01, qmax=0.99):\n",
    "    \"\"\"\n",
    "    Compute matched empirical quantiles of two samples for a Q-Q plot.\n",
    "\n",
    "    Both inputs are flattened and stripped of non-finite entries. Quantiles are\n",
    "    then evaluated on a shared grid of min(len(a), len(b)) levels, evenly\n",
    "    spaced from qmin to qmax.\n",
    "\n",
    "    Returns (a_quantiles, b_quantiles).\n",
    "    Raises ValueError when either sample is empty after filtering.\n",
    "    \"\"\"\n",
    "    def _finite_flat(values):\n",
    "        flat = np.asarray(values).ravel()\n",
    "        return flat[np.isfinite(flat)]\n",
    "\n",
    "    sample_a, sample_b = _finite_flat(a), _finite_flat(b)\n",
    "    n_levels = min(len(sample_a), len(sample_b))\n",
    "    if n_levels == 0:\n",
    "        raise ValueError(\"Empty input after filtering non-finite values.\")\n",
    "    levels = np.linspace(qmin, qmax, n_levels)\n",
    "    return np.quantile(sample_a, levels), np.quantile(sample_b, levels)\n",
    "\n",
    "# Inputs expected in the namespace: max_norm_list (Q-learning runs) and\n",
    "# max_norms_Y (Gaussian bootstrap runs).\n",
    "supnorm_Q = np.array(max_norm_list)\n",
    "supnorm_Y = np.array(max_norms_Y)\n",
    "\n",
    "# Quantile pairs on a shared probability grid (x = Q, y = Y).\n",
    "xq, yq = qq_match(supnorm_Q, supnorm_Y)\n",
    "\n",
    "# Common limits for both axes, padded by 2% of the data range.\n",
    "mn = float(min(xq.min(), yq.min()))\n",
    "mx = float(max(xq.max(), yq.max()))\n",
    "span = mx - mn if mx > mn else 1.0\n",
    "pad = 0.02 * span\n",
    "lo, hi = mn - pad, mx + pad\n",
    "\n",
    "# ---- figure ----\n",
    "fig, ax = plt.subplots(figsize=(7.5, 4.8))\n",
    "\n",
    "# Q-Q points (markers only) and the 45-degree reference line.\n",
    "ax.plot(xq, yq, 'o', ms=4.0, alpha=0.95, label=\"Gaussian approximation\")\n",
    "ax.plot([lo, hi], [lo, hi], '--', linewidth=1.2, label=r\"$y=x$\")\n",
    "\n",
    "# Axis labels; no title.\n",
    "ax.set_xlabel(r\"Quantiles of $\\max_{k_n \\leq l \\leq n}\\|\\sum_{t=l}^n (\\boldsymbol{Q}_t - \\boldsymbol{Q}^\\star)\\|_\\infty$\")\n",
    "ax.set_ylabel(r\"Quantiles of $\\max_{k_n \\leq l \\leq n}\\|\\sum_{t=l}^n Y_t\\|_\\infty$\")\n",
    "\n",
    "# Same limits and unit aspect ratio on both axes.\n",
    "ax.set_xlim(lo, hi)\n",
    "ax.set_ylim(lo, hi)\n",
    "ax.set_aspect('equal', adjustable='box')\n",
    "\n",
    "ax.minorticks_on()\n",
    "ax.legend(loc=\"upper left\", frameon=False)\n",
    "\n",
    "plt.tight_layout()\n",
    "plt.show()\n",
    "\n",
    "# ---- save in raster and vector formats ----\n",
    "fig.savefig(\"qq_supnorm_Q_vs_Y.png\",  bbox_inches=\"tight\")\n",
    "fig.savefig(\"qq_supnorm_Q_vs_Y.pdf\",  bbox_inches=\"tight\")\n",
    "fig.savefig(\"qq_supnorm_Q_vs_Y.svg\",  bbox_inches=\"tight\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "abd5ead2",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python (16GB venv)",
   "language": "python",
   "name": "jupyter_env"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
