import numpy as np

def test_policy(seed, n_experiments, T, best_arm_value, env, agent):
    """Run repeated experiments of ``agent`` on ``env`` and record regret.

    For each experiment both ``env`` and ``agent`` are reset, then ``T``
    rounds are played. In every round the agent picks an arm, the
    environment is queried with it, the agent is updated with the first
    element of the environment's answer, and the instantaneous regret
    ``best_arm_value - mean_reward`` is stored.

    Args:
        seed: Value passed to ``np.random.seed`` before the experiments start.
        n_experiments: Number of independent experiments (rows of the result).
        T: Number of rounds per experiment (columns of the result).
        best_arm_value: Reference value the per-round mean reward is
            subtracted from (presumably the expected reward of the optimal
            arm -- confirm with the caller).
        env: Object exposing ``reset()`` and ``round(arm)``; ``round`` must
            return a 3-tuple whose first element is forwarded to
            ``agent.update`` and whose third element is the mean reward
            used for the regret.
        agent: Object exposing ``reset()``, ``pull_arm()`` and
            ``update(feedback)``.

    Returns:
        ``numpy.ndarray`` of shape ``(n_experiments, T)`` holding the
        instantaneous regret of every round of every experiment.
    """
    np.random.seed(seed)
    regret = np.zeros((n_experiments, T))

    for exp in range(n_experiments):
        # Every experiment starts from a fresh environment and agent state.
        env.reset()
        agent.reset()

        # Play all T rounds, starting at index 0, so that every column of
        # ``regret`` is filled; starting at 1 would leave round 0 at its
        # zero default and silently under-report the regret.
        for t in range(T):
            arm = agent.pull_arm()
            # The second element of the environment's answer is not needed
            # here; only the feedback for the agent and the mean reward are.
            first, _reward, mean_reward = env.round(arm)
            agent.update(first)
            regret[exp, t] = best_arm_value - mean_reward

    return regret


# The name matches pytest's default ``test*`` collection pattern; mark the
# function as "not a test" so importing it into a test module does not make
# pytest try to run it as a test case with missing fixtures.
test_policy.__test__ = False
            

