{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import sys\n",
    "sys.path.append(\"../\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "scrolled": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "WARNING:tensorflow:\n",
      "The TensorFlow contrib module will not be included in TensorFlow 2.0.\n",
      "For more information, please see:\n",
      "  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md\n",
      "  * https://github.com/tensorflow/addons\n",
      "  * https://github.com/tensorflow/io (for I/O related ops)\n",
      "If you depend on functionality not listed there, please file an issue.\n",
      "\n",
      "WARNING:tensorflow:From /home/ggao5/OPE_Augment/LSTM_VAE_MUJOCO/LSTM_VAE_pen-human-v1/utils_latentPolicy_sac_lstm_zt_zt1.py:533: The name tf.AUTO_REUSE is deprecated. Please use tf.compat.v1.AUTO_REUSE instead.\n",
      "\n",
      "WARNING:tensorflow:From /tmp/ipykernel_25501/411476886.py:23: The name tf.ConfigProto is deprecated. Please use tf.compat.v1.ConfigProto instead.\n",
      "\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Warning: Flow failed to import. Set the environment variable D4RL_SUPPRESS_IMPORT_ERROR=1 to suppress this message.\n",
      "No module named 'flow'\n",
      "/home/ggao5/anaconda3/envs/ope_py37/lib/python3.7/site-packages/glfw/__init__.py:906: GLFWError: (65544) b'X11: The DISPLAY environment variable is missing'\n",
      "  warnings.warn(message, GLFWError)\n",
      "Warning: CARLA failed to import. Set the environment variable D4RL_SUPPRESS_IMPORT_ERROR=1 to suppress this message.\n",
      "No module named 'carla'\n",
      "pybullet build time: Dec  1 2021 18:33:04\n",
      "WARNING:tensorflow:From /tmp/ipykernel_25501/411476886.py:23: The name tf.ConfigProto is deprecated. Please use tf.compat.v1.ConfigProto instead.\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# from __future__ import division\n",
    "import tensorflow as tf\n",
    "import numpy as np\n",
    "from collections import deque\n",
    "import random\n",
    "import gym\n",
    "from gym import wrappers\n",
    "from gym.envs.classic_control.pendulum import angle_normalize, PendulumEnv\n",
    "from core import *\n",
    "from utils_latentPolicy_sac_lstm_zt_zt1 import *\n",
    "import os\n",
    "import tensorflow_probability as tfp\n",
    "import multiprocessing as mp\n",
    "import os\n",
    "import d4rl\n",
    "import json\n",
    "import pandas as pd\n",
    "os.environ[\"CUDA_VISIBLE_DEVICES\"] = \"1\"\n",
    "\n",
    "slim = tf.contrib.slim\n",
    "rnn = tf.contrib.rnn\n",
    "tfd = tfp.distributions\n",
    "config=tf.ConfigProto(log_device_placement=False)\n",
    "config.gpu_options.allow_growth = True\n",
    "\n",
    "env_name = 'pen-human-v1'\n",
    "# use normalized pattern-seg\n",
    "with open('./processed_data/train_pattern.npy', 'rb') as f: \n",
    "    DATA = np.load(f, allow_pickle=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "def evaluate(args):\n",
    "    \n",
    "    def learn_dist_from_s(state, code_size, reuse=tf.AUTO_REUSE, is_training=True, var_scope=\"BC\"):\n",
    "        with tf.variable_scope(var_scope, reuse=reuse) as scope:\n",
    "            with slim.arg_scope([slim.fully_connected], \n",
    "                                    activation_fn=tf.nn.relu,\n",
    "                                    weights_initializer=tf.glorot_uniform_initializer,\n",
    "                                    weights_regularizer=slim.l2_regularizer(0.001),\n",
    "                                    biases_regularizer=slim.l2_regularizer(0.001),\n",
    "                                    normalizer_fn = slim.batch_norm,\n",
    "                                    normalizer_params = {\"is_training\": is_training},\n",
    "                                    reuse = reuse,\n",
    "                                    scope = scope):\n",
    "                # is_training = False for evaluation\n",
    "                x = slim.fully_connected(state, 128, scope=\"fc1\")\n",
    "                x = slim.fully_connected(x, 64, scope=\"fc2\")\n",
    "                loc = slim.fully_connected(x, code_size, activation_fn=None, scope=\"loc\")\n",
    "                scale =slim.fully_connected(x, code_size, activation_fn=tf.nn.softplus, scope=\"scale\")\n",
    "    #             dist = tfd.MultivariateNormalDiag(loc, scale)\n",
    "                out_sample = tfd.TruncatedNormal(loc, scale, -1., 1.).sample() # -1, 1 bound\n",
    "                out_log_prob = trun_normal_log_prob(action_holder, loc, scale, -1., 1.)\n",
    "                return out_sample, out_log_prob\n",
    "        \n",
    "    def trun_normal_log_prob(x, mu, std, low, high):\n",
    "        z = tfd.Normal(0,1).cdf((high-x)/(std+EPS)) - tfd.Normal(0,1).cdf((low-x)/(std+EPS))\n",
    "        return tf.reduce_sum(-0.5*((x - mu) / (std+EPS))**2 - 0.5*tf.log(2*np.pi) - tf.log(std*z), axis=1, name=\"log_prob\")\n",
    "    \n",
    "    ope_path = args\n",
    "    \n",
    "#     try:\n",
    "\n",
    "    LR = 0.0003\n",
    "    GAMMA = .995\n",
    "    BUFFER_SIZE_SAC = 2*10**6\n",
    "    MINIBATCH_SIZE_SAC = 256\n",
    "    MINIBATCH_SIZE_OPE = 4\n",
    "    RANDOM_SEED = 2599\n",
    "    MAX_EPISODES = 2000\n",
    "    MAX_EPISODE_LEN = len(seg[0]['observations'])\n",
    "    NUM_OPE_MODELS = 1\n",
    "    CODE_SIZE = 16\n",
    "    EXPLORATION = .3\n",
    "    REPEAT = 1\n",
    "    BUFFER_SIZE_OPE = 3000\n",
    "    beta = 1.\n",
    "\n",
    "\n",
    "    OPE_LR = 0.001\n",
    "    OPE_DS = 1000\n",
    "    OPE_DR = 0.98\n",
    "    \n",
    "    EPS = 1e-8\n",
    "\n",
    "    BEST_MAE = 9999.\n",
    "    vae_seg = []\n",
    "\n",
    "    network_params = {\n",
    "    'hidden_sizes':[256, 256],\n",
    "    'activation':'relu',\n",
    "    'policy':mlp_gaussian_policy\n",
    "    }\n",
    "\n",
    "    rl_params = {\n",
    "        'env_name':env_name,\n",
    "\n",
    "        # control params\n",
    "        'seed': RANDOM_SEED,\n",
    "        'epochs': MAX_EPISODES,\n",
    "        'actor_critic':mlp_actor_critic,\n",
    "        'steps_per_epoch': MAX_EPISODE_LEN,\n",
    "        'replay_size': BUFFER_SIZE_SAC,\n",
    "        'batch_size': MINIBATCH_SIZE_SAC,\n",
    "        'start_epis': 0,\n",
    "        'max_ep_len': MAX_EPISODE_LEN,\n",
    "        'save_freq': 10,\n",
    "        'render': False,\n",
    "\n",
    "        # rl params\n",
    "        'gamma': 0.99,\n",
    "        'polyak': 0.995,\n",
    "        'lr': LR,\n",
    "        'grad_clip_val':None,\n",
    "\n",
    "        # entropy params\n",
    "        'alpha': 'auto',\n",
    "        'target_entropy':'auto' # fixed or auto define with -act_dim\n",
    "    }\n",
    "\n",
    "    file_appendix = (\n",
    "        \"lstm_vae_\" + rl_params['env_name'] + \"_\" + str(MAX_EPISODES)\n",
    "        + \"epi_repeat\"+ str(REPEAT) + \"_\" + str(LR) + \"_\"\n",
    "        + str(OPE_LR) + \"_\"\n",
    "        + str(OPE_DS) + \"_\"\n",
    "        + str(OPE_DR) + \"_\"\n",
    "        + str(CODE_SIZE) + \"_\"\n",
    "        + str(beta) + \"_\"\n",
    "        + str(RANDOM_SEED)\n",
    "    )\n",
    "\n",
    "#     env = gym.make(rl_params['env_name'])\n",
    "    np.random.seed(RANDOM_SEED)\n",
    "    tf.set_random_seed(RANDOM_SEED)\n",
    "#     env.seed(RANDOM_SEED)\n",
    "\n",
    "    env_state_dim = DATA[0]['observations'].shape[1] # NEED MOD\n",
    "    # state_dim = CODE_SIZE\n",
    "    env_action_dim = DATA[0]['actions'].shape[1] # NEED MOD\n",
    "    # get mean and std\n",
    "    ob = [i for u in DATA for j in u['observations'] for i in j]\n",
    "    OBS_MEAN = sum(ob)/len(ob)\n",
    "    OBS_STD = np.std(ob)\n",
    "\n",
    "    rw = [j for u in DATA for j in u['rewards']]\n",
    "    REW_MEAN = sum(rw)/len(rw)\n",
    "    REW_STD = np.std(rw)\n",
    "    env_action_bound = None\n",
    "    env_state_bound = None\n",
    "    # Ensure action bound is symmetric\n",
    "#     assert (env.action_space.high == -env.action_space.low)\n",
    "\n",
    "    graph_ope_models = tf.Graph()\n",
    "\n",
    "    graph_ac = tf.Graph()\n",
    "    \n",
    "    graph_behavior = tf.Graph()\n",
    "    \n",
    "    with tf.Session(config=config, graph=graph_behavior) as sess_behavior:\n",
    "\n",
    "        with tf.Session(config=config, graph=graph_ope_models) as sess_ope_models:\n",
    "\n",
    "            with graph_ope_models.as_default():\n",
    "\n",
    "                ope_model = OPE_Model(\n",
    "                    graph_ope_models, sess_ope_models, OPE_LR, OPE_DS, OPE_DR, CODE_SIZE,\n",
    "                    env_state_dim, env_state_bound, env_action_dim, file_appendix,\n",
    "                    BUFFER_SIZE_OPE, RANDOM_SEED, MINIBATCH_SIZE_OPE, MAX_EPISODE_LEN, beta,\n",
    "                    is_training=False\n",
    "                )\n",
    "\n",
    "                ope_saver = ope_model.saver\n",
    "\n",
    "                ope_saver.restore(sess_ope_models, ope_path)\n",
    "\n",
    "\n",
    "    #             d4rl_qlearning = d4rl.qlearning_dataset(env)\n",
    "\n",
    "                obs_mean = OBS_MEAN\n",
    "                obs_std = OBS_STD\n",
    "\n",
    "                rew_mean = REW_MEAN\n",
    "                rew_std = REW_STD\n",
    "\n",
    "                class LearnedEnv(object):\n",
    "                    def __init__(self, model):\n",
    "\n",
    "                        self.model = model\n",
    "\n",
    "                    def reset(self):\n",
    "                        self.model.init_z0_s0()\n",
    "                        s0 = self.model.sess.run(self.model.decoder_state_sample, \n",
    "                                           feed_dict={self.model.decoder_zt_holder:self.model.zt}).reshape(-1)\n",
    "\n",
    "                        self.obs = s0\n",
    "                        return s0\n",
    "\n",
    "                    def step(self, u):\n",
    "                        new_obs, reward = self.model.get_zt1_s2_r(np.reshape(u, (1, env_action_dim)))\n",
    "                        self.obs = new_obs\n",
    "                        self.model.update_zt()\n",
    "\n",
    "                        return new_obs, reward, False, {}\n",
    "\n",
    "                learned_env = LearnedEnv(ope_model)\n",
    "\n",
    "                np.random.seed(RANDOM_SEED)\n",
    "                tf.set_random_seed(RANDOM_SEED)\n",
    "\n",
    "                ep_rewards = []\n",
    "    #             policy = D4RL_Policy(target_policy_path)\n",
    "                \n",
    "        \n",
    "                # read learnt behavior\n",
    "                with graph_behavior.as_default():\n",
    "                    state_holder = tf.placeholder(shape=[None, env_state_dim], dtype=tf.float32, name='state_holder')\n",
    "                    action_holder = tf.placeholder(shape=[None, env_action_dim], dtype=tf.float32, name='action_holder')\n",
    "                    _learn_dist_from_s = learn_dist_from_s(state_holder, env_action_dim, reuse=tf.AUTO_REUSE, is_training=False)\n",
    "\n",
    "                    #First let's load meta graph and restore weights\n",
    "                    behavior_saver = tf.train.Saver()\n",
    "                    behavior_saver.restore(sess_behavior, './saved_dist/state_action_dist.ckpt')\n",
    "\n",
    "                for i in range(50):\n",
    "\n",
    "                    terminal = 0\n",
    "                    user_seg = {'observations':[],'actions':[],'rewards':[],'next_observations':[]}\n",
    "\n",
    "                    s = learned_env.reset()\n",
    "                    s = s.reshape(env_state_dim)*obs_std + obs_mean\n",
    "                    ep_reward = 0\n",
    "\n",
    "                    for j in range(MAX_EPISODE_LEN):\n",
    "                        user_seg['observations'].append(s)\n",
    "                        if j % REPEAT == 0:\n",
    "    #                         a, _ = policy.act(np.reshape(s, (env_state_dim,)), np.zeros((env_action_dim,)))\n",
    "#                             a = np.eye(env_action_dim)[np.random.choice(env_action_dim, 1)] # NEED MOD, OLD VERSION WITH RANDOM POLICY                            \n",
    "                            feed_dict={state_holder : [s],} # two dimension [[]]\n",
    "                            a = sess_behavior.run(_learn_dist_from_s[0], feed_dict=feed_dict)[0]\n",
    "\n",
    "                        s2, r, terminal, info = learned_env.step(a)\n",
    "                        r = r*rew_std + rew_mean\n",
    "                        s2 = s2.reshape(env_state_dim)*obs_std + obs_mean\n",
    "\n",
    "\n",
    "                        ep_reward += r*(GAMMA**j)\n",
    "\n",
    "                        s = s2\n",
    "\n",
    "    #                     if terminal or j == MAX_EPISODE_LEN-1:\n",
    "    #                         ep_rewards += [ep_reward]\n",
    "\n",
    "    #                         break\n",
    "                        user_seg['next_observations'].append(s2)\n",
    "                        user_seg['rewards'].append(r)\n",
    "                        user_seg['actions'].append(a)\n",
    "\n",
    "                    vae_seg.append(user_seg)\n",
    "                with open('./saved_augmented_data/'+ope_path.replace('./saved_model/', '').replace('/aug_best.ckpt', '')+'_augmented_segment.npy', 'wb') as f:\n",
    "                    np.save(f, vae_seg)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "OPEs = [\"./saved_model/\"+i+\"/aug_best.ckpt\" for i in os.listdir(\"./saved_model/\") ]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "pool = mp.Pool(3)\n",
    "pool.map(evaluate, [o_path for o_path in OPEs])\n",
    "pool.close()\n",
    "pool.join()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.11"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
