{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%pip install cvxpy\n",
    "%pip install pandas\n",
    "%pip install matplotlib"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "JUxr_11kg-Pa"
   },
   "outputs": [],
   "source": [
    "# === Cell 1: Imports / setup ===\n",
    "import numpy as np\n",
    "import cvxpy as cp\n",
    "import time\n",
    "from dataclasses import dataclass\n",
    "from typing import List, Tuple, Dict, Optional, Callable\n",
    "import matplotlib.pyplot as plt\n",
    "import pandas as pd\n",
    "\n",
    "np.set_printoptions(precision=4, suppress=True)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "BKrRu6Y1hAE8"
   },
   "outputs": [],
   "source": [
    "# === Cell 2: Environment ===\n",
    "UP, RIGHT, DOWN, LEFT = 0, 1, 2, 3\n",
    "ACTIONS = np.array([UP, RIGHT, DOWN, LEFT])\n",
    "\n",
    "@dataclass\n",
    "class GridWorldConfig:\n",
    "    G: int = 9\n",
    "    slip: float = 0.10\n",
    "    step_penalty: float = -0.01\n",
    "    goal_reward: float = 1.0\n",
    "    H: int = 5\n",
    "    start: Tuple[int,int] = (0,0)\n",
    "    goal: Optional[Tuple[int,int]] = None  # default (G-1,G-1)\n",
    "\n",
    "class GridWorld:\n",
    "    def __init__(self, cfg: GridWorldConfig):\n",
    "        self.cfg = cfg\n",
    "        self.G = cfg.G\n",
    "        self.goal = (self.G-1, self.G-1) if cfg.goal is None else cfg.goal\n",
    "        self.rng = np.random.default_rng(0)\n",
    "\n",
    "    def reset(self, seed: Optional[int] = None):\n",
    "        if seed is not None:\n",
    "            self.rng = np.random.default_rng(seed)\n",
    "        return self.cfg.start\n",
    "\n",
    "    def _move(self, s: Tuple[int,int], a: int) -> Tuple[int,int]:\n",
    "        x, y = s\n",
    "        if a == UP:    y = min(self.G-1, y+1)\n",
    "        if a == RIGHT: x = min(self.G-1, x+1)\n",
    "        if a == DOWN:  y = max(0, y-1)\n",
    "        if a == LEFT:  x = max(0, x-1)\n",
    "        return (x, y)\n",
    "\n",
    "    def step(self, s: Tuple[int,int], a: int) -> Tuple[Tuple[int,int], float]:\n",
    "        if self.rng.random() < self.cfg.slip:\n",
    "            a = int(self.rng.choice(ACTIONS))\n",
    "        s_next = self._move(s, a)\n",
    "        reward = self.cfg.goal_reward if (s_next == self.goal) else self.cfg.step_penalty\n",
    "        return s_next, reward\n",
    "\n",
    "    def all_states(self) -> List[Tuple[int,int]]:\n",
    "        return [(x,y) for x in range(self.G) for y in range(self.G)]\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "kVAV5LTShCP9"
   },
   "outputs": [],
   "source": [
    "# === Cell 3: DP to compute V* ===\n",
    "def transitions_and_rewards(env: GridWorld, s: Tuple[int,int], a: int):\n",
    "    cfg = env.cfg\n",
    "    outcomes = []\n",
    "    p_intended = 1.0 - cfg.slip\n",
    "    s_next = env._move(s, a)\n",
    "    r = cfg.goal_reward if (s_next == env.goal) else cfg.step_penalty\n",
    "    outcomes.append((s_next, p_intended, r))\n",
    "    p_slip_each = cfg.slip / len(ACTIONS)\n",
    "    for a_alt in ACTIONS:\n",
    "        s_next = env._move(s, a_alt)\n",
    "        r = cfg.goal_reward if (s_next == env.goal) else cfg.step_penalty\n",
    "        outcomes.append((s_next, p_slip_each, r))\n",
    "    from collections import defaultdict\n",
    "    acc = defaultdict(lambda: [0.0, 0.0])\n",
    "    for s2, p, r in outcomes:\n",
    "        acc[s2][0] += p\n",
    "        acc[s2][1] += p * r\n",
    "    merged = [(s2, P, Rexp / (P if P>0 else 1.0)) for s2,(P,Rexp) in acc.items()]\n",
    "    return merged\n",
    "\n",
    "def dp_optimal_values(env: GridWorld):\n",
    "    H = env.cfg.H\n",
    "    states = env.all_states()\n",
    "    V = {H+1: {s: 0.0 for s in states}}\n",
    "    Q = {}\n",
    "    for h in range(H, 0, -1):\n",
    "        Q[h] = {}\n",
    "        V[h] = {}\n",
    "        for s in states:\n",
    "            best = -1e9\n",
    "            qa = {}\n",
    "            for a in ACTIONS:\n",
    "                total = 0.0\n",
    "                for (s2, P, rbar) in transitions_and_rewards(env, s, a):\n",
    "                    total += P * (rbar + V[h+1][s2])\n",
    "                qa[a] = total\n",
    "                best = max(best, total)\n",
    "            Q[h][s] = qa\n",
    "            V[h][s] = best\n",
    "    return V, Q\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "9Q3EvLc9hEsu"
   },
   "outputs": [],
   "source": [
    "# === Cell 4: Kernels ===\n",
    "def norm_state(s, G):\n",
    "    x, y = s\n",
    "    return np.array([x/(G-1), y/(G-1)], dtype=float)\n",
    "\n",
    "def rbf_state(s1, s2, lengthscale=0.35, G=9):\n",
    "    x = norm_state(s1, G); y = norm_state(s2, G)\n",
    "    diff = x - y\n",
    "    return np.exp(-0.5 * (diff @ diff) / (lengthscale**2))\n",
    "\n",
    "def kernel_state_action(z1, z2, lengthscale=0.35, G=9):\n",
    "    (s1, a1), (s2, a2) = z1, z2\n",
    "    if a1 != a2:\n",
    "        return 0.0\n",
    "    return rbf_state(s1, s2, lengthscale=lengthscale, G=G)\n",
    "\n",
    "def gram_states(anchors: List[Tuple[int,int]], lengthscale=0.35, G=9):\n",
    "    m = len(anchors)\n",
    "    L = np.zeros((m,m))\n",
    "    for i, si in enumerate(anchors):\n",
    "        for j, sj in enumerate(anchors):\n",
    "            L[i,j] = rbf_state(si, sj, lengthscale=lengthscale, G=G)\n",
    "    return 0.5 * (L + L.T)\n",
    "\n",
    "def gram_z(Z: List[Tuple[Tuple[int,int], int]], lengthscale=0.35, G=9):\n",
    "    n = len(Z)\n",
    "    if n == 0:\n",
    "        return np.zeros((0,0))\n",
    "    K = np.zeros((n,n))\n",
    "    for i, zi in enumerate(Z):\n",
    "        for j, zj in enumerate(Z):\n",
    "            K[i,j] = kernel_state_action(zi, zj, lengthscale=lengthscale, G=G)\n",
    "    return 0.5 * (K + K.T)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "-ehhSiivhIoV"
   },
   "outputs": [],
   "source": [
    "# === Cell 5: QCQP Projection (robust) ===\n",
    "def solve_projection_qcqp(L: np.ndarray,\n",
    "                          v: np.ndarray,\n",
    "                          B: float,\n",
    "                          U: float,\n",
    "                          solver: Optional[str] = None,\n",
    "                          verbose: bool = False,\n",
    "                          scs_max_iters: int = 8000,\n",
    "                          scs_eps: float = 5e-5):\n",
    "    m = L.shape[0]\n",
    "    if m == 0:\n",
    "        return np.zeros((0,)), np.zeros((0,)), dict(status=\"EMPTY\", obj=0.0, solver=None, runtime=0.0)\n",
    "    Lsym = 0.5 * (L + L.T)\n",
    "    jitter = 1e-10  # objective-only jitter\n",
    "\n",
    "    alpha = cp.Variable(m)\n",
    "    obj = (1.0/m) * cp.sum_squares(Lsym @ alpha - v) + jitter * cp.sum_squares(alpha)\n",
    "    cons = [\n",
    "        cp.quad_form(alpha, Lsym) <= B**2,\n",
    "        Lsym @ alpha >= 0.0,\n",
    "        Lsym @ alpha <= U * np.ones(m)\n",
    "    ]\n",
    "    prob = cp.Problem(cp.Minimize(obj), cons)\n",
    "\n",
    "    installed = set(cp.installed_solvers())\n",
    "    chosen = solver or (\"MOSEK\" if \"MOSEK\" in installed else (\"SCS\" if \"SCS\" in installed else None))\n",
    "    try:\n",
    "        if chosen == \"SCS\":\n",
    "            prob.solve(solver=\"SCS\", verbose=verbose, max_iters=scs_max_iters, eps=scs_eps)\n",
    "        elif chosen == \"MOSEK\":\n",
    "            prob.solve(solver=\"MOSEK\", verbose=verbose)\n",
    "        else:\n",
    "            prob.solve(verbose=verbose)\n",
    "    except Exception as e:\n",
    "        chosen = f\"EXC:{type(e).__name__}\"\n",
    "\n",
    "    if prob.status not in (cp.OPTIMAL, cp.OPTIMAL_INACCURATE):\n",
    "        # Fallback: ridge + clip + min-norm preimage scaled into ball\n",
    "        Lreg = Lsym + 1e-6 * np.eye(m)\n",
    "        a_tmp = np.linalg.solve(Lreg, v)\n",
    "        vals = Lsym @ a_tmp\n",
    "        vals = np.clip(vals, 0.0, U)\n",
    "        a = np.linalg.solve(Lreg, vals)\n",
    "        norm2 = float(a.T @ Lsym @ a)\n",
    "        if norm2 > B**2:\n",
    "            a *= (B / (np.sqrt(norm2) + 1e-12))\n",
    "        return a, Lsym @ a, dict(status=f\"FALLBACK_{prob.status}\", obj=np.nan, solver=chosen, runtime=np.nan)\n",
    "\n",
    "    a = alpha.value\n",
    "    return a, Lsym @ a, dict(status=prob.status, obj=prob.value, solver=chosen, runtime=np.nan)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "GechAW6mhLBd"
   },
   "outputs": [],
   "source": [
    "# === Cell 6: Kernel Q estimator (robust) ===\n",
    "@dataclass\n",
    "class StageDataset:\n",
    "    Z: List[Tuple[Tuple[int,int], int]]\n",
    "    y: List[float]\n",
    "    max_size: int = 400\n",
    "\n",
    "    def add(self, z, yval):\n",
    "        self.Z.append(z)\n",
    "        self.y.append(float(yval))\n",
    "        if len(self.Z) > self.max_size:\n",
    "            self.Z = self.Z[-self.max_size:]\n",
    "            self.y = self.y[-self.max_size:]\n",
    "\n",
    "class KernelQEstimator:\n",
    "    def __init__(self, lengthscale=0.35, ridge=1e-2, G=9):\n",
    "        self.ls = lengthscale\n",
    "        self.ridge = ridge\n",
    "        self.G = G\n",
    "        self.Z = []\n",
    "        self.alpha = None\n",
    "        self.K_factor = None  # Cholesky\n",
    "\n",
    "    def fit(self, Z: List[Tuple[Tuple[int,int], int]], y: np.ndarray):\n",
    "        self.Z = list(Z)\n",
    "        n = len(self.Z)\n",
    "        if n == 0:\n",
    "            self.alpha = None\n",
    "            self.K_factor = None\n",
    "            return\n",
    "        K = gram_z(self.Z, lengthscale=self.ls, G=self.G)\n",
    "        K_reg = K + self.ridge * np.eye(n)\n",
    "        try:\n",
    "            L = np.linalg.cholesky(K_reg)\n",
    "            self.alpha = np.linalg.solve(L.T, np.linalg.solve(L, y))\n",
    "            self.K_factor = L\n",
    "        except np.linalg.LinAlgError:\n",
    "            self.alpha = np.linalg.solve(K_reg, y)\n",
    "            self.K_factor = None\n",
    "\n",
    "    def predict_mean_sigma(self, z: Tuple[Tuple[int,int], int]) -> Tuple[float, float]:\n",
    "        if self.alpha is None or len(self.Z) == 0:\n",
    "            return 0.0, 1.0\n",
    "        kvec = np.array([kernel_state_action(z, zi, lengthscale=self.ls, G=self.G) for zi in self.Z])\n",
    "        mean = float(kvec @ self.alpha)\n",
    "        if self.K_factor is not None:\n",
    "            v = np.linalg.solve(self.K_factor, kvec)\n",
    "            sigma2 = max(0.0, 1.0 - float(v @ v))\n",
    "        else:\n",
    "            K = gram_z(self.Z, lengthscale=self.ls, G=self.G) + self.ridge * np.eye(len(self.Z))\n",
    "            w = np.linalg.solve(K, kvec)\n",
    "            sigma2 = max(0.0, 1.0 - float(kvec @ w))\n",
    "        return mean, float(np.sqrt(max(1e-12, sigma2)))\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "N8TTTdmuhNj_"
   },
   "outputs": [],
   "source": [
    "# === Cell 7: Planning & policies ===\n",
    "def build_anchor_states(G: int, stride: int = 2) -> List[Tuple[int,int]]:\n",
    "    return [(x,y) for x in range(0, G, stride) for y in range(0, G, stride)]\n",
    "\n",
    "def optimistic_targets_for_V(stage_estimator: KernelQEstimator,\n",
    "                             anchors: List[Tuple[int,int]],\n",
    "                             beta: float,\n",
    "                             U_h: float,\n",
    "                             G: int) -> np.ndarray:\n",
    "    v = []\n",
    "    for s in anchors:\n",
    "        qmax = 0.0\n",
    "        for a in ACTIONS:\n",
    "            mu, sig = stage_estimator.predict_mean_sigma(((s,a)))\n",
    "            q_ucb = mu + beta * sig\n",
    "            if q_ucb > qmax:\n",
    "                qmax = q_ucb\n",
    "        v.append(float(np.clip(qmax, 0.0, U_h)))\n",
    "    return np.array(v, dtype=float)\n",
    "\n",
    "def ridge_fit_value(L: np.ndarray, v: np.ndarray, lam_val: float) -> np.ndarray:\n",
    "    m = L.shape[0]\n",
    "    return np.linalg.solve(L + lam_val * np.eye(m), v)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "Q4pZdrfthRR_"
   },
   "outputs": [],
   "source": [
    "# === Cell 8: Learners ===\n",
    "@dataclass\n",
    "class LearnerConfig:\n",
    "    lengthscale: float = 0.35\n",
    "    ridge_q: float = 1e-2\n",
    "    ridge_v: float = 1e-3\n",
    "    beta_c: float = 0.8\n",
    "    delta: float = 0.1\n",
    "\n",
    "class KOVIProjLearner:\n",
    "    def __init__(self, env: GridWorld, anchors: List[Tuple[int,int]],\n",
    "                 B: float, cfg: LearnerConfig):\n",
    "        self.env = env\n",
    "        self.anchors = anchors\n",
    "        self.B = B\n",
    "        self.cfg = cfg\n",
    "        self.H = env.cfg.H\n",
    "        self.G = env.cfg.G\n",
    "        self.U_h = [self.H - h + 1 for h in range(1, self.H+1)]\n",
    "        self.L = gram_states(anchors, lengthscale=cfg.lengthscale, G=self.G)\n",
    "        self.buffers = [StageDataset([], [], max_size=400) for _ in range(self.H+1)]\n",
    "        self.V_alpha = [None]*(self.H+2)\n",
    "        self.q_est = [KernelQEstimator(lengthscale=cfg.lengthscale, ridge=cfg.ridge_q, G=self.G) for _ in range(self.H+1)]\n",
    "\n",
    "    def plan(self):\n",
    "        m = len(self.anchors)\n",
    "        beta = self.cfg.beta_c * np.sqrt(np.log( (m*self.H+1)/self.cfg.delta ))\n",
    "        for h in range(self.H, 0, -1):\n",
    "            Z = self.buffers[h].Z\n",
    "            y = np.array(self.buffers[h].y) if len(self.buffers[h].y)>0 else np.zeros((0,))\n",
    "            self.q_est[h].fit(Z, y)\n",
    "            v_t = optimistic_targets_for_V(self.q_est[h], self.anchors, beta, self.U_h[h-1], self.G)\n",
    "            alpha, yproj, info = solve_projection_qcqp(self.L, v_t, B=self.B, U=self.U_h[h-1])\n",
    "            self.V_alpha[h] = alpha\n",
    "\n",
    "    def V(self, h: int, s: Tuple[int,int]) -> float:\n",
    "        if self.V_alpha[h] is None:\n",
    "            return 0.0\n",
    "        kvec = np.array([rbf_state(s, sj, lengthscale=self.cfg.lengthscale, G=self.G) for sj in self.anchors])\n",
    "        return float(kvec @ self.V_alpha[h])\n",
    "\n",
    "    def act(self, h: int, s: Tuple[int,int]) -> int:\n",
    "        beta = self.cfg.beta_c * np.sqrt(np.log( (len(self.anchors)*self.H+1)/self.cfg.delta ))\n",
    "        best_a, best_q = None, -1e9\n",
    "        for a in ACTIONS:\n",
    "            mu, sig = self.q_est[h].predict_mean_sigma(((s,a)))\n",
    "            q = mu + beta*sig\n",
    "            if q > best_q:\n",
    "                best_q, best_a = q, a\n",
    "        return int(best_a)\n",
    "\n",
    "    def observe(self, h: int, s: Tuple[int,int], a: int, r: float, s_next: Tuple[int,int]):\n",
    "        y = r + (self.V(h+1, s_next) if h < self.H else 0.0)\n",
    "        self.buffers[h].add(((s,a)), y)\n",
    "\n",
    "class KOVI0Learner(KOVIProjLearner):\n",
    "    def plan(self):\n",
    "        m = len(self.anchors)\n",
    "        beta = self.cfg.beta_c * np.sqrt(np.log( (m*self.H+1)/self.cfg.delta ))\n",
    "        for h in range(self.H, 0, -1):\n",
    "            Z = self.buffers[h].Z\n",
    "            y = np.array(self.buffers[h].y) if len(self.buffers[h].y)>0 else np.zeros((0,))\n",
    "            self.q_est[h].fit(Z, y)\n",
    "            v_t = optimistic_targets_for_V(self.q_est[h], self.anchors, beta, self.U_h[h-1], self.G)\n",
    "            self.V_alpha[h] = ridge_fit_value(self.L, v_t, lam_val=self.cfg.ridge_v)\n",
    "\n",
    "class KernelLSVIeps:\n",
    "    def __init__(self, env: GridWorld, anchors: List[Tuple[int,int]],\n",
    "                 eps0: float, cfg: LearnerConfig):\n",
    "        self.env = env\n",
    "        self.anchors = anchors\n",
    "        self.cfg = cfg\n",
    "        self.H = env.cfg.H\n",
    "        self.G = env.cfg.G\n",
    "        self.buffers = [StageDataset([], [], max_size=400) for _ in range(self.H+1)]\n",
    "        self.q_est = [KernelQEstimator(lengthscale=cfg.lengthscale, ridge=cfg.ridge_q, G=self.G) for _ in range(self.H+1)]\n",
    "        self.V_alpha = [None]*(self.H+2)\n",
    "        self.L = gram_states(anchors, lengthscale=cfg.lengthscale, G=self.G)\n",
    "        self.eps0 = eps0\n",
    "\n",
    "    def plan(self):\n",
    "        for h in range(self.H, 0, -1):\n",
    "            Z = self.buffers[h].Z\n",
    "            y = np.array(self.buffers[h].y) if len(self.buffers[h].y)>0 else np.zeros((0,))\n",
    "            self.q_est[h].fit(Z, y)\n",
    "            U_h = self.H - h + 1\n",
    "            v = []\n",
    "            for s in self.anchors:\n",
    "                qmax = 0.0\n",
    "                for a in ACTIONS:\n",
    "                    mu, _ = self.q_est[h].predict_mean_sigma(((s,a)))\n",
    "                    qmax = max(qmax, mu)\n",
    "                v.append(np.clip(qmax, 0.0, U_h))\n",
    "            self.V_alpha[h] = ridge_fit_value(self.L, np.array(v), lam_val=self.cfg.ridge_v)\n",
    "\n",
    "    def V(self, h: int, s: Tuple[int,int]) -> float:\n",
    "        if self.V_alpha[h] is None:\n",
    "            return 0.0\n",
    "        kvec = np.array([rbf_state(s, sj, lengthscale=self.cfg.lengthscale, G=self.G) for sj in self.anchors])\n",
    "        return float(kvec @ self.V_alpha[h])\n",
    "\n",
    "    def act(self, h: int, s: Tuple[int,int], episode_idx: int) -> int:\n",
    "        eps = self.eps0 / (1.0 + 0.03*episode_idx)\n",
    "        if np.random.random() < eps:\n",
    "            return int(np.random.choice(ACTIONS))\n",
    "        best_a, best_q = None, -1e9\n",
    "        for a in ACTIONS:\n",
    "            mu, _ = self.q_est[h].predict_mean_sigma(((s,a)))\n",
    "            if mu > best_q:\n",
    "                best_q, best_a = mu, a\n",
    "        return int(best_a)\n",
    "\n",
    "    def observe(self, h: int, s: Tuple[int,int], a: int, r: float, s_next: Tuple[int,int]):\n",
    "        y = r + (self.V(h+1, s_next) if h < self.H else 0.0)\n",
    "        self.buffers[h].add(((s,a)), y)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "1ANh6QyShVcp"
   },
   "outputs": [],
   "source": [
    "# === Cell 9: Harness ===\n",
    "@dataclass\n",
    "class RunConfig:\n",
    "    seeds: List[int]\n",
    "    episodes: int\n",
    "    anchors_stride: int = 2\n",
    "    B: float = 4.0\n",
    "    eps0: float = 0.15\n",
    "\n",
    "def run_experiment(env_cfg: GridWorldConfig, run_cfg: RunConfig):\n",
    "    env = GridWorld(env_cfg)\n",
    "    Vstar, Qstar = dp_optimal_values(env)\n",
    "    V1_star = Vstar[1][env_cfg.start]\n",
    "    print(f\"Optimal benchmark V1* = {V1_star:.4f}\")\n",
    "\n",
    "    anchors = build_anchor_states(env_cfg.G, stride=run_cfg.anchors_stride)\n",
    "    learner_cfg = LearnerConfig(lengthscale=0.35, ridge_q=1e-2, ridge_v=1e-3, beta_c=0.8, delta=0.1)\n",
    "\n",
    "    results = []\n",
    "    for seed in run_cfg.seeds:\n",
    "        kovip = KOVIProjLearner(env, anchors, B=run_cfg.B, cfg=learner_cfg)\n",
    "        kovi0 = KOVI0Learner(env, anchors, B=run_cfg.B, cfg=learner_cfg)\n",
    "        lsvie = KernelLSVIeps(env, anchors, eps0=run_cfg.eps0, cfg=learner_cfg)\n",
    "\n",
    "        returns = { 'KOVI-Proj': [], 'KOVI0': [], 'Kernel-LSVI-eps': [] }\n",
    "        cumreg  = { 'KOVI-Proj': [], 'KOVI0': [], 'Kernel-LSVI-eps': [] }\n",
    "        sums    = { 'KOVI-Proj': 0.0, 'KOVI0': 0.0, 'Kernel-LSVI-eps': 0.0 }\n",
    "\n",
    "        for ep in range(1, run_cfg.episodes+1):\n",
    "            kovip.plan(); kovi0.plan(); lsvie.plan()\n",
    "            # KOVI-Proj\n",
    "            s = env.reset(seed=1000+seed*100+ep)\n",
    "            ret = 0.0\n",
    "            for h in range(1, env_cfg.H+1):\n",
    "                a = kovip.act(h, s)\n",
    "                s2, r = env.step(s, a)\n",
    "                kovip.observe(h, s, a, r, s2)\n",
    "                ret += r; s = s2\n",
    "            returns['KOVI-Proj'].append(ret)\n",
    "            sums['KOVI-Proj'] += (V1_star - ret)\n",
    "            cumreg['KOVI-Proj'].append(sums['KOVI-Proj'])\n",
    "            # KOVI0\n",
    "            s = env.reset(seed=2000+seed*100+ep)\n",
    "            ret = 0.0\n",
    "            for h in range(1, env_cfg.H+1):\n",
    "                a = kovi0.act(h, s)\n",
    "                s2, r = env.step(s, a)\n",
    "                kovi0.observe(h, s, a, r, s2)\n",
    "                ret += r; s = s2\n",
    "            returns['KOVI0'].append(ret)\n",
    "            sums['KOVI0'] += (V1_star - ret)\n",
    "            cumreg['KOVI0'].append(sums['KOVI0'])\n",
    "            # Kernel-LSVI-eps\n",
    "            s = env.reset(seed=3000+seed*100+ep)\n",
    "            ret = 0.0\n",
    "            for h in range(1, env_cfg.H+1):\n",
    "                a = lsvie.act(h, s, episode_idx=ep-1)\n",
    "                s2, r = env.step(s, a)\n",
    "                lsvie.observe(h, s, a, r, s2)\n",
    "                ret += r; s = s2\n",
    "            returns['Kernel-LSVI-eps'].append(ret)\n",
    "            sums['Kernel-LSVI-eps'] += (V1_star - ret)\n",
    "            cumreg['Kernel-LSVI-eps'].append(sums['Kernel-LSVI-eps'])\n",
    "\n",
    "        for alg in returns.keys():\n",
    "            results.append(dict(seed=seed, algorithm=alg,\n",
    "                                returns=np.array(returns[alg]),\n",
    "                                cumreg=np.array(cumreg[alg])))\n",
    "    return V1_star, results\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import matplotlib.pyplot as plt\n",
    "import matplotlib as mpl\n",
    "\n",
    "# Set global figure params\n",
    "mpl.rcParams['figure.dpi'] = 400           # High resolution\n",
    "mpl.rcParams['savefig.dpi'] = 400          # High resolution when saving\n",
    "mpl.rcParams['figure.figsize'] = (6, 4)    # Default figure size (in inches)\n",
    "\n",
    "# Font sizes for paper-quality plots\n",
    "mpl.rcParams['font.size'] = 14             # Base font size\n",
    "mpl.rcParams['axes.titlesize'] = 16        # Title size\n",
    "mpl.rcParams['axes.labelsize'] = 14        # X/Y label size\n",
    "mpl.rcParams['xtick.labelsize'] = 12       # Tick labels\n",
    "mpl.rcParams['ytick.labelsize'] = 12\n",
    "mpl.rcParams['legend.fontsize'] = 12\n",
    "mpl.rcParams['figure.titlesize'] = 16\n",
    "\n",
    "# Optional: use LaTeX-style fonts for publication\n",
    "# mpl.rcParams['text.usetex'] = True\n",
    "# mpl.rcParams['font.family'] = 'serif'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "nqmQpMZYhWV7"
   },
   "outputs": [],
   "source": [
    "# === Cell 10: Run, plot, save ===\n",
    "# env_cfg = GridWorldConfig(G=9, slip=0.10, step_penalty=-0.01, goal_reward=1.0, H=5, start=(0,0), goal=None)\n",
    "# run_cfg = RunConfig(seeds=[0,1,2], episodes=40, anchors_stride=2, B=4.0, eps0=0.15)\n",
    "\n",
    "env_cfg = GridWorldConfig(\n",
    "    G=5,            # 5x5 grid\n",
    "    H=10,           # horizon >= 2*(G-1) = 8\n",
    "    slip=0.10,\n",
    "    step_penalty=-0.01,\n",
    "    goal_reward=1.0,\n",
    "    start=(0,0),\n",
    "    goal=(4,4)\n",
    ")\n",
    "run_cfg = RunConfig(seeds=[0,1,2], episodes=100, anchors_stride=1, B=4.0, eps0=0.15)\n",
    "\n",
    "\n",
    "V1_star, results = run_experiment(env_cfg, run_cfg)\n",
    "\n",
    "algs = ['KOVI-Proj','KOVI0','Kernel-LSVI-eps']\n",
    "episodes = run_cfg.episodes\n",
    "cum_curves = {alg: [] for alg in algs}\n",
    "ret_curves = {alg: [] for alg in algs}\n",
    "for r in results:\n",
    "    alg = r['algorithm']\n",
    "    cum_curves[alg].append(r['cumreg'])\n",
    "    ret_curves[alg].append(r['returns'])\n",
    "mean_cum = {alg: np.mean(np.vstack(cum_curves[alg]), axis=0) for alg in algs}\n",
    "sem_cum  = {alg: np.std(np.vstack(cum_curves[alg]), axis=0, ddof=1)/np.sqrt(len(run_cfg.seeds)) for alg in algs}\n",
    "\n",
    "# Plot\n",
    "fig = plt.figure(figsize=(6,4))\n",
    "x = np.arange(1, episodes+1)\n",
    "for alg in algs:\n",
    "    y = mean_cum[alg]\n",
    "    e = sem_cum[alg]\n",
    "    plt.plot(x, y, label=alg)\n",
    "    plt.fill_between(x, y-e, y+e, alpha=0.2)\n",
    "plt.xlabel(\"Episode\"); plt.ylabel(\"Cum. Regret (mean over seeds)\")\n",
    "plt.title(\"GridWorld 9x9 (H=5): Cum. Regret\")\n",
    "plt.legend(); plt.tight_layout()\n",
    "plt.savefig(\"gridworld_regret.png\", dpi=400); plt.show()\n",
    "\n",
    "def lin_slope(x, y):\n",
    "    X = np.vstack([np.ones_like(x), x]).T\n",
    "    b0, b1 = np.linalg.lstsq(X, y, rcond=None)[0]\n",
    "    return float(b1)\n",
    "\n",
    "summary_rows = []\n",
    "for alg in algs:\n",
    "    y = mean_cum[alg]\n",
    "    slope = lin_slope(x, y)\n",
    "    final = float(y[-1])\n",
    "    rets = np.vstack(ret_curves[alg])\n",
    "    avg_return = float(np.mean(rets))\n",
    "    sem_return = float(np.std(rets, ddof=1)/np.sqrt(rets.shape[0]))\n",
    "    summary_rows.append(dict(algorithm=alg,\n",
    "                             mean_cum_regret=final,\n",
    "                             regret_slope_per_ep=slope,\n",
    "                             mean_return=avg_return,\n",
    "                             sem_return=sem_return))\n",
    "summary_df = pd.DataFrame(summary_rows)\n",
    "print(summary_df)\n",
    "\n",
    "summary_df.to_csv(\"gridworld_results_summary.csv\", index=False)\n",
    "traj_rows = []\n",
    "for alg in algs:\n",
    "    for ep, val in enumerate(mean_cum[alg], 1):\n",
    "        traj_rows.append(dict(algorithm=alg, episode=ep, cum_regret=float(val)))\n",
    "traj_df = pd.DataFrame(traj_rows)\n",
    "traj_df.to_csv(\"gridworld_trajectories_mean.csv\", index=False)\n",
    "print(\"Saved: gridworld_regret.png, gridworld_results_summary.csv, gridworld_trajectories_mean.csv\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "HbWOmUe9hYvE"
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "accelerator": "GPU",
  "colab": {
   "gpuType": "T4",
   "provenance": []
  },
  "kernelspec": {
   "display_name": "Python3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
