{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "id": "stuffed-sweden"
   },
   "outputs": [
    {
     "ename": "ModuleNotFoundError",
     "evalue": "No module named 'cvxpy'",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mModuleNotFoundError\u001b[0m                       Traceback (most recent call last)",
      "\u001b[0;32m<ipython-input-1-54ffd62853d9>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m      9\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mrandom\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     10\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mos\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 11\u001b[0;31m \u001b[0;32mimport\u001b[0m \u001b[0mcvxpy\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mcp\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m     12\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0mrandom\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mrandrange\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     13\u001b[0m \u001b[0;31m#from matplotlib.backends.backend_pdf import PdfPages\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;31mModuleNotFoundError\u001b[0m: No module named 'cvxpy'"
     ]
    }
   ],
   "source": [
    "import numpy as np\n",
    "import scipy\n",
    "import matplotlib.pylab as plt\n",
    "plt.rcParams['pdf.fonttype'] = 42\n",
    "plt.rcParams['ps.fonttype'] = 42\n",
    "#import torch\n",
    "import math\n",
    "import copy\n",
    "import random\n",
    "import os\n",
    "import cvxpy as cp\n",
    "from random import randrange\n",
    "#from matplotlib.backends.backend_pdf import PdfPages"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "short-picnic"
   },
   "outputs": [],
   "source": [
    "dim = 5\n",
    "H = 5\n",
    "T = 100\n",
    "repeat =1\n",
    "ACTION = np.power(2,dim-1)\n",
    "STATE = H+2\n",
    "Inner = 3\n",
    "soft = 10\n",
    "\n",
    "thetastar = np.append(0.01*np.ones(dim-1), 1)\n",
    "delta = 0.05\n",
    "\n",
    "# dim = 5 0.9\n",
    "# H = 5\n",
    "# T = 100000\n",
    "# ACTION = np.power(2,dim-1)\n",
    "# STATE = H+2\n",
    "\n",
    "# thetastar = np.append(0.1*np.ones(dim-1), 1)\n",
    "# delta = 0.3"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "id": "referenced-produce"
   },
   "outputs": [],
   "source": [
    "def trans_action(a, d):###d is dimension, dimension of a is d-1\n",
    "    tt = np.zeros(d-1)-1\n",
    "    bb = bin(a)\n",
    "    for i in range(len(bb)-2):\n",
    "        tt[i] = 2*np.double(bb[i+2])-1\n",
    "    return(tt)\n",
    "\n",
    "\n",
    "def phi(s,a,sp):\n",
    "    \"\"\"Feature vector of the transition from state ``s`` to ``sp`` under action ``a``.\n",
    "\n",
    "    Returns a float array of length ``dim``. It is non-zero only for\n",
    "      * s < H,  sp == s+1 : (-signs, 1-delta)\n",
    "      * s < H,  sp == H+1 : (+signs, delta)\n",
    "      * s == sp == H or s == sp == H+1 : (0, ..., 0, 1)\n",
    "    where ``signs`` is ``trans_action(a, dim)``.\n",
    "    \"\"\"\n",
    "    feat = np.zeros(dim)\n",
    "    last = dim - 1\n",
    "    signs = trans_action(a, dim)\n",
    "    if s < H and sp == s+1:\n",
    "        feat[:last] = -signs[:last]\n",
    "        feat[last] = 1-delta\n",
    "    if s < H and sp == H+1:\n",
    "        feat[:last] = signs[:last]\n",
    "        feat[last] = delta\n",
    "    if s == sp and (s == H or s == H+1):\n",
    "        # self-loop of a terminal state: only the last coordinate is set\n",
    "        feat[last] = 1\n",
    "    return feat\n",
    "\n",
    "\n",
    "def phiv(s,a,v):\n",
    "    \"\"\"Value-weighted feature: sum over successors sp of phi(s,a,sp)*v[sp].\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    s, a : int\n",
    "        Current state and action index.\n",
    "    v : sequence indexed by state\n",
    "        Value of every state at the next stage (must cover index H+1).\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    numpy.ndarray of length ``dim``.\n",
    "    \"\"\"\n",
    "    if s == H or s == H+1:\n",
    "        # Both terminal states only loop onto themselves (see phi). The\n",
    "        # previous code handled H+1 only; for s == H it added phi(H,a,H+1)\n",
    "        # twice, which is the zero vector, and dropped the v[H] term.\n",
    "        return phi(s,a,s)*v[s]\n",
    "    # Non-terminal state: the successors are s+1 and H+1.\n",
    "    return (phi(s,a,H+1)*v[H+1] + phi(s,a,s+1)*v[s+1])\n",
    "\n",
    "def proba(s,a,sp):\n",
    "    \"\"\"Probability of moving from ``s`` to ``sp`` under ``a``: <phi(s,a,sp), thetastar>.\"\"\"\n",
    "    feature = phi(s, a, sp)\n",
    "    return np.dot(feature, thetastar)\n",
    "\n",
    "def reward(s,a):\n",
    "    \"\"\"Reward is 1 in state H+1 and 0 in every other state; the action is ignored.\"\"\"\n",
    "    return 1 if s == H+1 else 0\n",
    "\n",
    "def advreward(s,a):\n",
    "    \"\"\"Complement of ``reward``: 1 wherever ``reward`` is 0 and vice versa.\"\"\"\n",
    "    base = reward(s, a)\n",
    "    return 1 - base\n",
    "\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "legitimate-journalist",
    "outputId": "ace2837b-36a8-48ea-abc4-153dca62f86b"
   },
   "outputs": [
    {
     "ename": "NameError",
     "evalue": "name 'repeat' is not defined",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mNameError\u001b[0m                                 Traceback (most recent call last)",
      "\u001b[0;32m<ipython-input-3-9513f36575d2>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0;32mfor\u001b[0m \u001b[0mjjjj\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mrange\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mrepeat\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m      2\u001b[0m     \u001b[0;31m############tabular###################\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      3\u001b[0m     \u001b[0ms_cur\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;36m0\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      4\u001b[0m     \u001b[0mREWARD\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;36m0\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      5\u001b[0m     \u001b[0mTOTALREWARD\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;31mNameError\u001b[0m: name 'repeat' is not defined"
     ]
    }
   ],
   "source": [
    "for jjjj in range(repeat):\n",
    "    ############ Optimal baseline: always play the action whose bits are all 1 ############\n",
    "    REWARD = 0          # cumulative reward over every step of this run\n",
    "    TOTALREWARD = []    # value of REWARD after each step (T*H entries)\n",
    "    # np.int was removed in NumPy 1.24; the builtin int is the drop-in replacement.\n",
    "    # The action never changes, so it is computed once per run.\n",
    "    a_cur = int(np.power(2, dim-1))-1\n",
    "    for t in range(T):\n",
    "        s_cur = 0       # every episode restarts in state 0\n",
    "        for stage in range(H):\n",
    "            s_next = s_cur\n",
    "            if s_cur < H:\n",
    "                # Bernoulli draw: 1 -> advance to s_cur+1, 0 -> jump to state H+1.\n",
    "                s_next = H+1-np.random.binomial(1, proba(s_cur, a_cur, s_cur+1))*(H-s_cur)\n",
    "            REWARD += reward(s_cur, a_cur)\n",
    "            TOTALREWARD.append(REWARD)\n",
    "            s_cur = s_next\n",
    "    # One cumulative-reward value per line; read back by the plotting cell.\n",
    "    path = \"optimal_lowerbound_0.1_\" + str(jjjj) + \".txt\"\n",
    "    with open(path, 'w') as fr:\n",
    "        for i in TOTALREWARD:\n",
    "            fr.write(str(i))\n",
    "            fr.write(\"\\n\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "id": "mental-moisture"
   },
   "outputs": [],
   "source": [
    "for jjjj in range(repeat):\n",
    "    ############ Random baseline: a uniformly random action at every step ############\n",
    "    REWARD = 0          # cumulative reward over every step of this run\n",
    "    TOTALREWARD = []    # value of REWARD after each step (T*H entries)\n",
    "    for t in range(T):\n",
    "        s_cur = 0       # every episode restarts in state 0\n",
    "        for stage in range(H):\n",
    "            s_next = s_cur\n",
    "            a_cur = randrange(ACTION)\n",
    "            if s_cur < H:\n",
    "                # Bernoulli draw: 1 -> advance to s_cur+1, 0 -> jump to state H+1.\n",
    "                s_next = H+1-np.random.binomial(1, proba(s_cur, a_cur, s_cur+1))*(H-s_cur)\n",
    "            REWARD += reward(s_cur, a_cur)\n",
    "            TOTALREWARD.append(REWARD)\n",
    "            s_cur = s_next\n",
    "    # One cumulative-reward value per line; read back by the plotting cell.\n",
    "    # The context manager closes the file even if a write fails.\n",
    "    path = \"random_lowerbound_0.1_\" + str(jjjj) + \".txt\"\n",
    "    with open(path, 'w') as fr:\n",
    "        for i in TOTALREWARD:\n",
    "            fr.write(str(i))\n",
    "            fr.write(\"\\n\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "artistic-details",
    "outputId": "c14c1da2-3ca6-4021-d126-f72be5e125e7"
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0\n",
      "0\n",
      "1\n",
      "2\n"
     ]
    }
   ],
   "source": [
    "for jjjj in range(repeat):\n",
    "    ############ Optimistic policy optimisation (saved as OPPO_*, plotted as 'OPERA') ############\n",
    "    print(jjjj)\n",
    "    s_cur = np.zeros(H+1).astype(int)   # states visited during the current episode\n",
    "    a_cur = np.zeros(H).astype(int)     # actions played during the current episode\n",
    "    QVALUE = np.zeros((STATE, ACTION, H+1))\n",
    "    VVALUE = np.zeros((STATE, H+1))\n",
    "    LAMBDA = 1       # ridge term that initialises every SIGMA[:,:,h]\n",
    "    beta=2.75        # radius used in the confidence constraint around hattheta\n",
    "    SIGMA = np.zeros((dim, dim, H))\n",
    "    for stage in range(H):\n",
    "        SIGMA[:,:,stage] = LAMBDA*np.diag(np.ones(dim))\n",
    "    BB = np.zeros((dim, H))\n",
    "    POLICY = np.ones((STATE, ACTION, H+1))/ACTION   # start from the uniform policy\n",
    "    REWARD = 0\n",
    "    TOTALREWARD = []\n",
    "    for t in range(T):\n",
    "        print(t)\n",
    "        s_cur[0] = 0\n",
    "        # ---- roll out one episode with the current policy ----\n",
    "        for stage in range(H):\n",
    "            # size is left at its default so choice() returns a scalar; storing the\n",
    "            # former size-1 array into an int slot is deprecated since NumPy 1.25.\n",
    "            a_cur[stage] = np.random.choice(ACTION, p=POLICY[s_cur[stage],:,stage])\n",
    "            s_next = s_cur[stage]\n",
    "            if s_cur[stage]< H:\n",
    "                # Bernoulli draw: 1 -> advance one state, 0 -> jump to state H+1.\n",
    "                s_next = H+1-np.random.binomial(1, proba(s_cur[stage], a_cur[stage] , s_cur[stage]+1))*(H-s_cur[stage])\n",
    "            REWARD += reward(s_cur[stage], a_cur[stage])\n",
    "            TOTALREWARD.append(REWARD)\n",
    "            s_cur[stage+1] = s_next\n",
    "        # ---- backward sweeps: refit the model, then re-evaluate and improve the policy ----\n",
    "        # NOTE(review): the episode's transitions are added to SIGMA/BB on each of the\n",
    "        # Inner sweeps, i.e. Inner times per episode - confirm this is intended.\n",
    "        for ite in range(Inner):\n",
    "            for stage in range(H):\n",
    "                curstage=H-1-stage   # walk the stages from last to first\n",
    "                phi_v = phiv(s_cur[curstage], a_cur[curstage], VVALUE[:,curstage+1])\n",
    "                BB [:,curstage] = BB [:,curstage] + phi_v*VVALUE[s_cur[curstage+1], curstage+1]\n",
    "                phi_v.shape = (dim,1)\n",
    "                SIGMA[:,:,curstage]  = SIGMA[:,:,curstage] + np.dot(phi_v, phi_v.T)\n",
    "                # Regularised least-squares estimate: solve SIGMA * theta = BB.\n",
    "                hattheta = np.linalg.lstsq(SIGMA[:,:,curstage], BB[:,curstage], rcond=-1)\n",
    "                hattheta = hattheta[0]\n",
    "                hattheta.shape = (dim)\n",
    "                for stat in range(STATE):\n",
    "                    for act in range(ACTION):\n",
    "                        # Optimistic Q: maximise phi_v . x over the confidence set around hattheta.\n",
    "                        # NOTE(review): the constraint uses SIGMA itself rather than a matrix\n",
    "                        # square root of it - confirm against the intended confidence set.\n",
    "                        f = cp.Parameter(dim)\n",
    "                        x = cp.Variable(dim)\n",
    "                        constraints1 = [cp.norm(SIGMA[:,:,curstage]@(x-hattheta), 2)<=beta]\n",
    "                        prob1 = cp.Problem(cp.Minimize(f.T@x),constraints1)\n",
    "                        phi_v = phiv(stat, act, VVALUE[:,curstage+1])\n",
    "                        f.value = -phi_v\n",
    "                        prob1.solve()\n",
    "                        QVALUE[stat,act,curstage] =reward(stat,act)+ phi_v.T@x.value\n",
    "                        # Clip to [0, stage+1]; stage+1 == H-curstage, the number of steps left.\n",
    "                        QVALUE[stat,act,curstage] = min(QVALUE[stat,act,curstage],stage+1)\n",
    "                        QVALUE[stat,act,curstage] = max (QVALUE[stat,act,curstage],0)\n",
    "                    # V is evaluated under the policy from before this update.\n",
    "                    VVALUE [stat, curstage]= np.dot(QVALUE [stat,:,curstage],  POLICY[stat,:,curstage].T)\n",
    "                    # Exponential-weights update, normalised over actions. Vectorised so the\n",
    "                    # builtin sum() is no longer shadowed by a variable named sum.\n",
    "                    weights = np.exp(soft*QVALUE[stat,:,curstage])\n",
    "                    POLICY[stat,:,curstage] = weights/weights.sum()\n",
    "\n",
    "    # One cumulative-reward value per line; read back by the plotting cell.\n",
    "    path  = \"OPPO_lowerbound_0.1_\" + str(jjjj) +\".txt\"\n",
    "    with open(path,'w') as fr:\n",
    "        for i in TOTALREWARD:\n",
    "            fr.write(str(i))\n",
    "            fr.write(\"\\n\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 244
    },
    "id": "quick-limitation",
    "outputId": "ef57d773-9133-48b6-af9f-1ce7187bd1da",
    "scrolled": true
   },
   "outputs": [
    {
     "ename": "FileNotFoundError",
     "evalue": "ignored",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mFileNotFoundError\u001b[0m                         Traceback (most recent call last)",
      "\u001b[0;32m<ipython-input-4-f9b258aae05b>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m     10\u001b[0m     \u001b[0mline1\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     11\u001b[0m     \u001b[0mpath\u001b[0m  \u001b[0;34m=\u001b[0m \u001b[0;34m\"optimal_lowerbound_0.1_\"\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0mstr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mjjjj\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m+\u001b[0m\u001b[0;34m\".txt\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 12\u001b[0;31m     \u001b[0mfr\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mopen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mpath\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m'r'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m     13\u001b[0m     \u001b[0;32mfor\u001b[0m \u001b[0mline\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mfr\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     14\u001b[0m         \u001b[0mrewardopt\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mline1\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mrewardopt\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mline1\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m+\u001b[0m\u001b[0mint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mline\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;31mFileNotFoundError\u001b[0m: [Errno 2] No such file or directory: 'optimal_lowerbound_0.1_0.txt'"
     ]
    }
   ],
   "source": [
    "T1=H*T   # total number of simulated steps per run\n",
    "\n",
    "\n",
    "def load_mean_curve(prefix):\n",
    "    \"\"\"Average the cumulative-reward logs ``<prefix><run>.txt`` over all ``repeat`` runs.\n",
    "\n",
    "    Each file holds one integer per line (one line per simulated step).\n",
    "    Returns a float array of length T1+1; entries past the last line read stay 0.\n",
    "    \"\"\"\n",
    "    mean_curve = np.zeros(T1+1)\n",
    "    for jjjj in range(repeat):\n",
    "        path = prefix + str(jjjj) + \".txt\"\n",
    "        with open(path, 'r') as fr:\n",
    "            for line1, line in enumerate(fr):\n",
    "                mean_curve[line1] += int(line)\n",
    "    return mean_curve/repeat\n",
    "\n",
    "\n",
    "rewardopt = load_mean_curve(\"optimal_lowerbound_0.1_\")\n",
    "# Regret of a method = mean reward of the optimal baseline minus its own mean reward.\n",
    "rewardran = rewardopt - load_mean_curve(\"random_lowerbound_0.1_\")\n",
    "rewardOPPO = rewardopt - load_mean_curve(\"OPPO_lowerbound_0.1_\")\n",
    "\n",
    "# One point per episode: every H-th step, starting at step 0. The stop index is T1\n",
    "# (not T1-1) so the slice always has exactly T entries, including when H == 1.\n",
    "x = list(range(1, T+1))\n",
    "fig, ax = plt.subplots()\n",
    "# The optimal baseline is drawn on the same per-episode axis as the regret curves;\n",
    "# it was previously plotted per step, which did not match the x-axis label.\n",
    "ax.plot(x, rewardopt[0:T1:H], color='orange', label='Optimal')\n",
    "ax.plot(x, rewardran[0:T1:H], color='red', label='Random')\n",
    "ax.plot(x, rewardOPPO[0:T1:H], color='blue', label='OPERA')\n",
    "\n",
    "ax.legend()\n",
    "ax.set_xlabel('Number of Episodes')\n",
    "ax.set_ylabel('Regret')\n",
    "# Save both files before plt.show(): once the figure has been shown, plt.gcf()\n",
    "# can hand back a new empty figure, which is what the JPG used to be saved from.\n",
    "fig.savefig('0_pdf' + \"test.pdf\", dpi=100)\n",
    "fig.savefig('./666.jpg')\n",
    "plt.show()"
   ]
  }
 ],
 "metadata": {
  "colab": {
   "provenance": []
  },
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
