import numpy as np
from tqdm import trange
from model import UCB, TS, PHE, KL_EXP, SupLinUCB
from data import generate_contexts
import time, json, itertools, os

def eval_UCB(N, d, alpha_set=(0.001, 0.01, 0.1, 1), T=30000, M=10, rho=0.5, R=1, seed=0, output=False):
    """Simulate the ``UCB`` model for every ``alpha`` in ``alpha_set``.

    For each ``alpha`` the simulation is repeated ``M`` times for ``T`` rounds.
    In every run a true parameter ``beta`` is drawn uniformly from
    ``[-1/sqrt(d), 1/sqrt(d)]^d``; each round the model picks an index into the
    generated contexts, observes the linear reward plus Gaussian noise, and is
    updated with it.

    Args:
        N: First argument forwarded to ``generate_contexts`` (presumably the
            number of arms per round -- confirm against ``data``).
        d: Dimension of ``beta``; also forwarded to the model and to
            ``generate_contexts``.
        alpha_set: Iterable of ``alpha`` values to evaluate.
        T: Number of rounds per run.
        M: Number of runs per ``alpha``; must be at least 1.
        rho: Forwarded to ``generate_contexts`` (meaning not visible here).
        R: Standard deviation of the Gaussian reward noise.
        seed: Base seed; run ``m`` seeds NumPy with ``seed + m`` and round ``t``
            requests contexts with ``seed + t + m``.
        output: If true, return the best result instead of writing a file.

    Returns:
        When ``output`` is true, the result dict (keys ``model``, ``settings``,
        ``regrets``, ``beta_err``, ``time``) whose final cumulative regret,
        averaged over the ``M`` runs, is smallest. Otherwise ``None``; all
        result dicts are dumped as JSON to ``./results/UCB_d{d}_N{N}.txt``.

    Raises:
        ValueError: If ``M`` is smaller than 1, or (from ``np.argmin``) if
            ``output`` is true and ``alpha_set`` is empty.

    NOTE(review): the context seed ``seed + t + m`` is the same for run ``m``
    at round ``t`` and run ``m + 1`` at round ``t - 1``. If
    ``generate_contexts`` is deterministic in its seed, the runs see shifted
    copies of one context sequence rather than independent ones -- confirm
    this is intended.
    """
    if M < 1:
        # With no run there is no model whose ``settings`` could be recorded;
        # fail early instead of hitting an unbound local further down.
        raise ValueError(f'M must be at least 1, got {M}')

    results = []
    for alpha in alpha_set:
        cumul_regret = np.zeros((M, T))
        beta_err = np.zeros((M, T))
        elapsed_time = np.zeros((M, T))
        for m in range(M):
            print('UCB Simulation %d, N=%d, d=%d, alpha=%.3f' % (m+1, N, d, alpha))
            M_UCB = UCB(d=d, alpha=alpha)
            np.random.seed(seed + m)
            beta = np.random.uniform(-1/np.sqrt(d), 1/np.sqrt(d), d)
            opt_reward, UCB_reward = [], []

            for t in trange(T):
                contexts = generate_contexts(N, d, rho, seed=seed + t + m)
                opt_reward.append(np.max(np.array(contexts) @ beta))
                # perf_counter is monotonic and high resolution, which matters
                # because a single select/update step can be very short.
                start = time.perf_counter()
                a_t = M_UCB.select_ac(contexts)
                mean_reward = np.dot(contexts[a_t], beta)
                # size=1 keeps the reward a length-1 array, as the model has
                # always been fed.
                reward = mean_reward + np.random.normal(0, R, size=1)
                UCB_reward.append(mean_reward)
                M_UCB.update(reward)
                elapsed_time[m, t] = time.perf_counter() - start
                beta_err[m, t] = np.linalg.norm(M_UCB.beta_hat - beta)

            # Regret is measured on noiseless expected rewards.
            cumul_regret[m, :] = np.cumsum(opt_reward) - np.cumsum(UCB_reward)

        # ``settings`` is read from the model of the last run for this alpha.
        results.append({'model': 'UCB',
                        'settings': M_UCB.settings,
                        'regrets': cumul_regret.tolist(),
                        'beta_err': beta_err.tolist(),
                        'time': elapsed_time.tolist()})
    if output:
        # Pick the hyperparameter with the lowest mean final cumulative regret.
        last_regret = [np.mean(r['regrets'], axis=0)[-1] for r in results]
        return results[int(np.argmin(last_regret))]
    else:
        os.makedirs('./results', exist_ok=True)
        with open(f'./results/UCB_d{d}_N{N}.txt', 'w') as outfile:
            json.dump(results, outfile)


def eval_TS(N, d, v_set=(0.001, 0.01, 0.1, 1), T=30000, M=10, rho=0.5, R=1, seed=0, output=False):
    """Simulate the ``TS`` model for every ``v`` in ``v_set``.

    For each ``v`` the simulation is repeated ``M`` times for ``T`` rounds.
    In every run a true parameter ``beta`` is drawn uniformly from
    ``[-1/sqrt(d), 1/sqrt(d)]^d``; each round the model picks an index into the
    generated contexts, observes the linear reward plus Gaussian noise, and is
    updated with it.

    Args:
        N: First argument forwarded to ``generate_contexts`` (presumably the
            number of arms per round -- confirm against ``data``).
        d: Dimension of ``beta``; also forwarded to the model and to
            ``generate_contexts``.
        v_set: Iterable of ``v`` values to evaluate.
        T: Number of rounds per run.
        M: Number of runs per ``v``; must be at least 1.
        rho: Forwarded to ``generate_contexts`` (meaning not visible here).
        R: Standard deviation of the Gaussian reward noise.
        seed: Base seed; run ``m`` seeds NumPy with ``seed + m`` and round ``t``
            requests contexts with ``seed + t + m``.
        output: If true, return the best result instead of writing a file.

    Returns:
        When ``output`` is true, the result dict (keys ``model``, ``settings``,
        ``regrets``, ``beta_err``, ``time``) whose final cumulative regret,
        averaged over the ``M`` runs, is smallest. Otherwise ``None``; all
        result dicts are dumped as JSON to ``./results/TS_d{d}_N{N}.txt``.

    Raises:
        ValueError: If ``M`` is smaller than 1, or (from ``np.argmin``) if
            ``output`` is true and ``v_set`` is empty.

    NOTE(review): the context seed ``seed + t + m`` is the same for run ``m``
    at round ``t`` and run ``m + 1`` at round ``t - 1``. If
    ``generate_contexts`` is deterministic in its seed, the runs see shifted
    copies of one context sequence rather than independent ones -- confirm
    this is intended.
    """
    if M < 1:
        # With no run there is no model whose ``settings`` could be recorded;
        # fail early instead of hitting an unbound local further down.
        raise ValueError(f'M must be at least 1, got {M}')

    results = []
    for v in v_set:
        cumul_regret = np.zeros((M, T))
        beta_err = np.zeros((M, T))
        elapsed_time = np.zeros((M, T))
        for m in range(M):
            print('TS Simulation %d, N=%d, d=%d, v=%.3f' % (m+1, N, d, v))
            M_TS = TS(d=d, v=v)
            np.random.seed(seed + m)
            beta = np.random.uniform(-1/np.sqrt(d), 1/np.sqrt(d), d)
            opt_reward, TS_reward = [], []

            for t in trange(T):
                contexts = generate_contexts(N, d, rho, seed=seed + t + m)
                opt_reward.append(np.max(np.array(contexts) @ beta))
                # perf_counter is monotonic and high resolution, which matters
                # because a single select/update step can be very short.
                start = time.perf_counter()
                a_t = M_TS.select_ac(contexts)
                mean_reward = np.dot(contexts[a_t], beta)
                # size=1 keeps the reward a length-1 array, as the model has
                # always been fed.
                reward = mean_reward + np.random.normal(0, R, size=1)
                TS_reward.append(mean_reward)
                M_TS.update(reward)
                elapsed_time[m, t] = time.perf_counter() - start
                beta_err[m, t] = np.linalg.norm(M_TS.beta_hat - beta)

            # Regret is measured on noiseless expected rewards.
            cumul_regret[m, :] = np.cumsum(opt_reward) - np.cumsum(TS_reward)

        # ``settings`` is read from the model of the last run for this v.
        results.append({'model': 'TS',
                        'settings': M_TS.settings,
                        'regrets': cumul_regret.tolist(),
                        'beta_err': beta_err.tolist(),
                        'time': elapsed_time.tolist()})
    if output:
        # Pick the hyperparameter with the lowest mean final cumulative regret.
        last_regret = [np.mean(r['regrets'], axis=0)[-1] for r in results]
        return results[int(np.argmin(last_regret))]
    else:
        os.makedirs('./results', exist_ok=True)
        with open(f'./results/TS_d{d}_N{N}.txt', 'w') as outfile:
            json.dump(results, outfile)


def eval_PHE(N, d, alpha_set=(0.001, 0.01, 0.1, 1), T=30000, M=10, rho=0.5, R=1, seed=0, output=False):
    """Simulate the ``PHE`` model for every ``alpha`` in ``alpha_set``.

    For each ``alpha`` the simulation is repeated ``M`` times for ``T`` rounds.
    In every run a true parameter ``beta`` is drawn uniformly from
    ``[-1/sqrt(d), 1/sqrt(d)]^d``; each round the model picks an index into the
    generated contexts, observes the linear reward plus Gaussian noise, and is
    updated with it.

    Args:
        N: First argument forwarded to ``generate_contexts`` (presumably the
            number of arms per round -- confirm against ``data``).
        d: Dimension of ``beta``; also forwarded to the model and to
            ``generate_contexts``.
        alpha_set: Iterable of ``alpha`` values to evaluate.
        T: Number of rounds per run.
        M: Number of runs per ``alpha``; must be at least 1.
        rho: Forwarded to ``generate_contexts`` (meaning not visible here).
        R: Standard deviation of the Gaussian reward noise.
        seed: Base seed; run ``m`` seeds NumPy with ``seed + m`` and round ``t``
            requests contexts with ``seed + t + m``.
        output: If true, return the best result instead of writing a file.

    Returns:
        When ``output`` is true, the result dict (keys ``model``, ``settings``,
        ``regrets``, ``beta_err``, ``time``) whose final cumulative regret,
        averaged over the ``M`` runs, is smallest. Otherwise ``None``; all
        result dicts are dumped as JSON to ``./results/PHE_d{d}_N{N}.txt``.

    Raises:
        ValueError: If ``M`` is smaller than 1, or (from ``np.argmin``) if
            ``output`` is true and ``alpha_set`` is empty.

    NOTE(review): the context seed ``seed + t + m`` is the same for run ``m``
    at round ``t`` and run ``m + 1`` at round ``t - 1``. If
    ``generate_contexts`` is deterministic in its seed, the runs see shifted
    copies of one context sequence rather than independent ones -- confirm
    this is intended.
    """
    if M < 1:
        # With no run there is no model whose ``settings`` could be recorded;
        # fail early instead of hitting an unbound local further down.
        raise ValueError(f'M must be at least 1, got {M}')

    results = []
    for alpha in alpha_set:
        cumul_regret = np.zeros((M, T))
        beta_err = np.zeros((M, T))
        elapsed_time = np.zeros((M, T))
        for m in range(M):
            print('PHE Simulation %d, N=%d, d=%d, alpha=%.3f' % (m+1, N, d, alpha))
            M_PHE = PHE(d=d, alpha=alpha)
            np.random.seed(seed + m)
            beta = np.random.uniform(-1/np.sqrt(d), 1/np.sqrt(d), d)
            opt_reward, PHE_reward = [], []

            for t in trange(T):
                contexts = generate_contexts(N, d, rho, seed=seed + t + m)
                opt_reward.append(np.max(np.array(contexts) @ beta))
                # perf_counter is monotonic and high resolution, which matters
                # because a single select/update step can be very short.
                start = time.perf_counter()
                a_t = M_PHE.select_ac(contexts)
                mean_reward = np.dot(contexts[a_t], beta)
                # size=1 keeps the reward a length-1 array, as the model has
                # always been fed.
                reward = mean_reward + np.random.normal(0, R, size=1)
                PHE_reward.append(mean_reward)
                M_PHE.update(reward)
                elapsed_time[m, t] = time.perf_counter() - start
                beta_err[m, t] = np.linalg.norm(M_PHE.beta_hat - beta)

            # Regret is measured on noiseless expected rewards.
            cumul_regret[m, :] = np.cumsum(opt_reward) - np.cumsum(PHE_reward)

        # ``settings`` is read from the model of the last run for this alpha.
        results.append({'model': 'PHE',
                        'settings': M_PHE.settings,
                        'regrets': cumul_regret.tolist(),
                        'beta_err': beta_err.tolist(),
                        'time': elapsed_time.tolist()})
    if output:
        # Pick the hyperparameter with the lowest mean final cumulative regret.
        last_regret = [np.mean(r['regrets'], axis=0)[-1] for r in results]
        return results[int(np.argmin(last_regret))]
    else:
        os.makedirs('./results', exist_ok=True)
        with open(f'./results/PHE_d{d}_N{N}.txt', 'w') as outfile:
            json.dump(results, outfile)


def eval_KLBandit(N, d, eta_set=(0.1, 0.01), T=30000, M=10, rho=0.5, R=1.0,
               seed=0, ref_policy='uniform', lam=1.0, output=False):
    """Simulate the ``KL_EXP`` model for every ``eta`` in ``eta_set``.

    For each ``eta`` the simulation is repeated ``M`` times for ``T`` rounds.
    In every run a true parameter ``beta`` is drawn uniformly from
    ``[-1/sqrt(d), 1/sqrt(d)]^d``; each round the model picks an index into the
    generated contexts, observes the linear reward plus Gaussian noise, and is
    updated with it.

    Args:
        N: First argument forwarded to ``generate_contexts`` (presumably the
            number of arms per round -- confirm against ``data``).
        d: Dimension of ``beta``; also forwarded to the model and to
            ``generate_contexts``.
        eta_set: Iterable of ``eta`` values to evaluate.
        T: Number of rounds per run.
        M: Number of runs per ``eta``; must be at least 1.
        rho: Forwarded to ``generate_contexts`` (meaning not visible here).
        R: Standard deviation of the Gaussian reward noise.
        seed: Base seed; run ``m`` seeds NumPy with ``seed + m`` and round ``t``
            requests contexts with ``seed + t + m``.
        ref_policy: Forwarded unchanged to the ``KL_EXP`` constructor.
        lam: Forwarded unchanged to the ``KL_EXP`` constructor.
        output: If true, return the best result instead of writing a file.

    Returns:
        When ``output`` is true, the result dict (keys ``model``, ``settings``,
        ``regrets``, ``beta_err``, ``time``) whose final cumulative regret,
        averaged over the ``M`` runs, is smallest. Otherwise ``None``; all
        result dicts are dumped as JSON to ``./results/KLBandit_d{d}_N{N}.txt``.

    Raises:
        ValueError: If ``M`` is smaller than 1, or (from ``np.argmin``) if
            ``output`` is true and ``eta_set`` is empty.

    NOTE(review): the context seed ``seed + t + m`` is the same for run ``m``
    at round ``t`` and run ``m + 1`` at round ``t - 1``. If
    ``generate_contexts`` is deterministic in its seed, the runs see shifted
    copies of one context sequence rather than independent ones -- confirm
    this is intended.
    """
    if M < 1:
        # With no run there is no model whose ``settings`` could be recorded;
        # fail early instead of hitting an unbound local further down.
        raise ValueError(f'M must be at least 1, got {M}')

    results = []
    for eta in eta_set:
        cumul_regret = np.zeros((M, T))
        beta_err = np.zeros((M, T))
        elapsed_time = np.zeros((M, T))

        for m in range(M):
            print(f'KL-Bandit Simulation {m+1}, N={N}, d={d}, eta={eta:.3f}')
            M_KL = KL_EXP(d=d, eta=eta, ref_policy=ref_policy, lam=lam)
            np.random.seed(seed + m)
            beta = np.random.uniform(-1/np.sqrt(d), 1/np.sqrt(d), d)
            opt_reward, alg_reward = [], []

            for t in trange(T):
                contexts = generate_contexts(N, d, rho, seed=seed + t + m)
                opt_reward.append(np.max(np.asarray(contexts) @ beta))
                # perf_counter is monotonic and high resolution, which matters
                # because a single select/update step can be very short.
                start = time.perf_counter()
                a_t = M_KL.select_ac(contexts)
                mean_rt = float(np.dot(contexts[a_t], beta))
                # size=1 keeps the reward a length-1 array, as the model has
                # always been fed.
                reward = mean_rt + np.random.normal(0.0, R, size=1)
                alg_reward.append(mean_rt)
                M_KL.update(reward)
                elapsed_time[m, t] = time.perf_counter() - start
                beta_err[m, t] = np.linalg.norm(M_KL.beta_hat - beta)

            # Regret is measured on noiseless expected rewards.
            cumul_regret[m, :] = np.cumsum(opt_reward) - np.cumsum(alg_reward)

        # ``settings`` is read from the model of the last run for this eta.
        results.append({
            'model': 'KL-Bandit',
            'settings': M_KL.settings,
            'regrets': cumul_regret.tolist(),
            'beta_err': beta_err.tolist(),
            'time': elapsed_time.tolist()
        })

    if output:
        # Pick the hyperparameter with the lowest mean final cumulative regret.
        last_regret = [np.mean(res['regrets'], axis=0)[-1] for res in results]
        return results[int(np.argmin(last_regret))]
    else:
        os.makedirs('./results', exist_ok=True)
        with open(f'./results/KLBandit_d{d}_N{N}.txt', 'w') as outfile:
            json.dump(results, outfile)


def eval_SupLinUCB(N, d, alpha_set=(0.001, 0.01, 0.1, 1), T=30000, M=10, rho=0.5, R=1, seed=0, output=False):
    """Simulate the ``SupLinUCB`` model for every ``alpha`` in ``alpha_set``.

    For each ``alpha`` the simulation is repeated ``M`` times for ``T`` rounds.
    In every run a true parameter ``beta`` is drawn uniformly from
    ``[-1/sqrt(d), 1/sqrt(d)]^d``; each round the model picks an index into the
    generated contexts, observes the linear reward plus Gaussian noise, and is
    updated with it. Unlike the other evaluators in this file, no estimation
    error of ``beta`` is recorded.

    Args:
        N: First argument forwarded to ``generate_contexts`` (presumably the
            number of arms per round -- confirm against ``data``).
        d: Dimension of ``beta``; also forwarded to the model and to
            ``generate_contexts``.
        alpha_set: Iterable of ``alpha`` values to evaluate.
        T: Number of rounds per run; also passed to the model constructor.
        M: Number of runs per ``alpha``; must be at least 1.
        rho: Forwarded to ``generate_contexts`` (meaning not visible here).
        R: Standard deviation of the Gaussian reward noise.
        seed: Base seed; run ``m`` seeds NumPy with ``seed + m`` and round ``t``
            requests contexts with ``seed + t + m``.
        output: If true, return the best result instead of writing a file.

    Returns:
        When ``output`` is true, the result dict (keys ``model``, ``settings``,
        ``regrets``, ``time``) whose final cumulative regret, averaged over the
        ``M`` runs, is smallest. Otherwise ``None``; all result dicts are
        dumped as JSON to ``./results/SupLinUCB_d{d}_N{N}.txt``.

    Raises:
        ValueError: If ``M`` is smaller than 1, or (from ``np.argmin``) if
            ``output`` is true and ``alpha_set`` is empty.

    NOTE(review): the context seed ``seed + t + m`` is the same for run ``m``
    at round ``t`` and run ``m + 1`` at round ``t - 1``. If
    ``generate_contexts`` is deterministic in its seed, the runs see shifted
    copies of one context sequence rather than independent ones -- confirm
    this is intended.
    """
    if M < 1:
        # With no run there is no model whose ``settings`` could be recorded;
        # fail early instead of hitting an unbound local further down.
        raise ValueError(f'M must be at least 1, got {M}')

    results = []
    for alpha in alpha_set:
        cumul_regret = np.zeros((M, T))
        elapsed_time = np.zeros((M, T))
        for m in range(M):
            print('SupLinUCB Simulation %d, N=%d, d=%d, alpha=%.3f' % (m+1, N, d, alpha))
            model = SupLinUCB(d=d, alpha=alpha, T=T)
            # Draw the true parameter for this run.
            np.random.seed(seed + m)
            beta = np.random.uniform(-1/np.sqrt(d), 1/np.sqrt(d), d)
            opt_reward = []
            alg_reward = []

            for t in trange(T):
                contexts = generate_contexts(N, d, rho, seed=seed + t + m)
                # Best achievable expected reward this round.
                opt_reward.append(np.amax(np.array(contexts) @ beta))
                # perf_counter is monotonic and high resolution, which matters
                # because a single select/update step can be very short.
                start = time.perf_counter()
                a_t = model.select_ac(contexts)
                mean_reward = np.dot(contexts[a_t], beta)
                # size=1 keeps the reward a length-1 array, as the model has
                # always been fed.
                reward = mean_reward + np.random.normal(0, R, size=1)
                alg_reward.append(mean_reward)
                model.update(reward)
                elapsed_time[m, t] = time.perf_counter() - start

            # Regret is measured on noiseless expected rewards.
            cumul_regret[m, :] = np.cumsum(opt_reward) - np.cumsum(alg_reward)

        # ``settings`` is read from the model of the last run for this alpha.
        results.append({'model': 'SupLinUCB',
                        'settings': model.settings,
                        'regrets': cumul_regret.tolist(),
                        'time': elapsed_time.tolist()})

    if output:
        # Pick the hyperparameter with the lowest mean final cumulative regret.
        last_regret = [np.mean(r['regrets'], axis=0)[-1] for r in results]
        return results[int(np.argmin(last_regret))]
    else:
        os.makedirs('./results', exist_ok=True)
        with open(f'./results/SupLinUCB_d{d}_N{N}.txt', 'w') as outfile:
            json.dump(results, outfile)
