# -*- coding: utf-8 -*-
# LSVI_UCB_main.py

# Run the LSVI-UCB algorithm for several independent trials and save each
# trial's per-epoch history under Results/.

import argparse
import os

import numpy as np

from utils import RandomSimplexVector
from Env import FiniteStateFiniteActionLinearMDP
from SafeLSVE import safeTrainer

# Command-line interface. Every option is optional as far as argparse is
# concerned, so a flag that is omitted is left as None on ``args``.
parser = argparse.ArgumentParser(description='Run LSVI-UCB algorithm')
for flag, arg_type, help_text in (
    ('-H', int, 'the length of horizon'),
    ('-S', int, 'the number of states'),
    ('-A', int, 'the number of actions'),
    ('-d', int, 'the number of dimensions of feature'),
    ('-env', str, 'the name of environment'),
    ('-N', int, 'the number of total epochs'),
    ('-M', int, 'the number of total trials'),
):
    parser.add_argument(flag, type=arg_type, help=help_text)
args = parser.parse_args()

# Build the MDP from the sizes given on the command line, then load the
# environment named by -env.
mdp_dims = {'H': args.H, 'S': args.S, 'A': args.A, 'd': args.d}
env = FiniteStateFiniteActionLinearMDP(**mdp_dims)
env.load_env(args.env)

# best_gen() returns a pair: the first item is reported below, and both items
# are later handed to the trainer as its baseline.
b, actions = env.best_gen()
print('optimal policy average total rewards: ', b)

# Make sure the output directory exists before any training starts; otherwise
# np.save would raise only after a whole trial had already been computed.
results_dir = 'Results'
os.makedirs(results_dir, exist_ok=True)

# Run args.M trials, each with a freshly constructed trainer, and store the
# args.N per-epoch results of trial j in Results/hist_UCB_<j>.npy.
for j in range(args.M):
    safe_trainer = safeTrainer(env, baseline=(b, actions))
    safe_trainer.beta_ucb = 1
    # One epoch_train(algo='UCB') result per epoch.
    hist = [safe_trainer.epoch_train(algo='UCB') for _ in range(args.N)]
    np.save(os.path.join(results_dir, 'hist_UCB_' + str(j) + '.npy'), hist)