# Regret Bounds for Satisficing in Multi-Armed Bandit Problems
#
# This file contains the code for two classes of bandits (Bernoulli rewards and Gaussian rewards)
# and the algorithms considered in the paper
#
# The bandits take parameters to initialize randomly their reward distributions
# but this behaviour can be overwritten by specifying directly the desired distributions for the arms
#
# The algorithms are designed to run a single simulation from scratch for a given number of steps
# Each function takes as input a bandit instance, a satisfaction level and a number of steps
# In addition, an algorithm-specific parameter can be passed to tune the algorithm
# Each function returns different metrics, namely the reward at each step,
# the (pseudo-)regret at each step (either S-regret or standard regret depending on whether there exists a satisfying arm or not)
# and the expected reward at each step based on the chosen arm
# The algorithms are self-contained on purpose so they can be copied to another script easily if needed
#
# The experiment function can be used to run the algorithm multiple times and/or on different bandits instances
# and return the average of the metrics
#
# Global seed shared by all experiments: set it to an integer to run every experiment
# with the same seed, or to None to leave NumPy's global random state untouched
# The value seed=1 allows to produce results similar to the ones presented in the paper
seed = 1  # use None to disable re-seeding
# This seed is used to reset the random number generator before each new experiment

import setting
import numpy as np
import sys
from scipy.stats import norm
import time
import copy
import matplotlib.pyplot as plt
import seaborn as sns


# Simple CLI progress bar that does not need any particular library
def progressbar(it, prefix="", size=60, file=sys.stdout):
    """Yield the items of `it` while drawing a text progress bar on `file`.

    Parameters:
        it: A sized iterable (``len(it)`` must work).
        prefix: Text written before the bar.
        size: Width of the bar in characters.
        file: Writable text stream that receives the bar.

    The bar is redrawn in place (carriage return) before the first item and
    after every item, and a newline is written once the iterable is exhausted.
    An empty iterable draws a single, already complete bar.
    """
    count = len(it)

    def show(j):
        # With nothing to iterate over there is no meaningful fraction;
        # draw a full bar instead of dividing by zero.
        x = int(size * j / count) if count else size
        file.write("%s[%s%s] %i/%i\r" % (prefix, "#" * x, "." * (size - x), j, count))
        file.flush()

    show(0)
    for i, item in enumerate(it):
        yield item
        show(i + 1)
    file.write("\n")
    file.flush()


# Parent class for bandits
# Arms always return the same values
#
# If the means are unspecified, they are chosen uniformly in the given range
class Bandit:
    """Base bandit whose arms always return their mean (no noise).

    Parameters:
        nb_arm: Number of arms.
        range: Two-element sequence ``(low, high)``; when `means` is not
            given, the means are drawn uniformly at random from this interval
            using NumPy's global random generator.
        means: Optional explicit means, one per arm (list or NumPy array).
            When given, it is stored as-is and `range` is ignored.
    """

    def __init__(self, nb_arm, range=(0, 1), means=None):
        self.nb_arm = nb_arm
        # Identity test on purpose: `means == None` is evaluated element-wise
        # for NumPy arrays and cannot be used as an `if` condition.
        if means is None:
            low, high = range[0], range[1]
            self.means = np.random.rand(nb_arm) * (high - low) + low
        else:
            self.means = means

    def pull(self, arm):
        """Return the reward of `arm`, which is exactly its mean."""
        return self.means[arm]


# Bandit with Gaussian distributed rewards
# If unspecified, the standard deviations are set to 1 by default
class GaussianBandit(Bandit):
    """Bandit whose arms return Gaussian rewards.

    Parameters:
        nb_arm: Number of arms.
        range: Interval forwarded to `Bandit` for drawing random means.
        means: Optional explicit means, forwarded to `Bandit`.
        sigmas: Optional standard deviations, one per arm (list or NumPy
            array). Defaults to 1 for every arm.
    """

    def __init__(self, nb_arm, range=(0, 1), means=None, sigmas=None):
        super().__init__(nb_arm, range=range, means=means)
        # Identity test on purpose: `sigmas != None` is element-wise for
        # NumPy arrays and would raise when used as an `if` condition.
        if sigmas is not None:
            self.sigmas = sigmas
        else:
            self.sigmas = np.ones(nb_arm)

    def pull(self, arm):
        """Draw one reward for `arm`: its mean plus sigma-scaled standard normal noise."""
        return np.random.randn() * self.sigmas[arm] + self.means[arm]


# Bandit with Bernoulli distributed rewards
class BernoulliBandit(Bandit):
    """Bandit whose arms pay 1.0 with probability equal to their mean, else 0.0.

    `threshold` is accepted but not used by this class.
    """

    def __init__(self, nb_arm, range=[0, 1], threshold=0.7, means=None):
        super().__init__(nb_arm, range=range, means=means)

    def pull(self, arm):
        """Draw one Bernoulli reward for `arm`."""
        success = np.random.rand() < self.means[arm]
        return 1.0 if success else 0.0


########################################################################
# Run a sequence of experiments using the specified algorithm and return the average of the metrics (rewards,regret and expected rewards)
# algo : Algorithm (function) used for the experiments
# bandits : A list of bandit instances (possibly only one) on which we want to run the algorithm
# satisfaction_level : Satisfaction level
# nb_step : Duration of each run
# nb_repetition : Number of repetitions of the experiment on each instance in the parameter `bandits`
# parameter : Optional parameter for the algorithm
# error_type : Type of error bar to use for the figure (standard deviation or quantile)
def experiment(
    algo,
    bandits,
    satisfaction_level,
    nb_step,
    nb_repetition=1,
    parameter=1,
    error_type="quantile",
):
    """Run `algo` repeatedly and return the averaged per-step metrics.

    Parameters:
        algo: Algorithm function called as
            ``algo(bandit, satisfaction_level, nb_step, parameter=parameter)``
            and returning ``(rewards, regrets, expectations)``.
        bandits: Iterable of bandit instances (possibly only one).
        satisfaction_level: Satisfaction level passed to `algo`.
        nb_step: Duration of each run.
        nb_repetition: Number of runs on each bandit instance.
        parameter: Optional algorithm-specific parameter.
        error_type: ``"std"`` (standard deviation) or ``"quantile"``
            (25% / 75% quantiles) of the cumulative regret across runs.

    Returns:
        ``(mean rewards, mean regrets, mean expectations, error bars)``, where
        the means are taken over all runs at each step.

    Raises:
        ValueError: If `error_type` is neither ``"std"`` nor ``"quantile"``.
    """
    # Validate up front so that a typo is reported before the (possibly long)
    # runs instead of surfacing afterwards as an unbound local variable.
    if error_type not in ("std", "quantile"):
        raise ValueError(
            f"error_type must be 'std' or 'quantile', got {error_type!r}"
        )

    # Re-seed the global generator so every experiment starts from the same state
    if seed is not None:
        np.random.seed(seed)
    print(f"Experiment ({nb_repetition} runs)")
    rewards, regrets, expectations = [], [], []
    # setting.val_prior is reset here and incremented before every run
    # (presumably a prior variance read by algorithms defined elsewhere -- confirm)
    setting.val_prior = 0
    for bandit in bandits:  # In case we want to average over multiple bandit instances
        for _ in progressbar(
            range(nb_repetition)
        ):  # Repetition of the experiment on the bandit instance
            setting.val_prior += 1
            rew, reg, exp = algo(
                bandit, satisfaction_level, nb_step, parameter=parameter
            )
            rewards.append(rew)
            regrets.append(reg)
            expectations.append(exp)

    # Error bars describe the spread of the cumulative regret across runs
    cumulative_regrets = np.cumsum(regrets, axis=1)
    if error_type == "std":
        error_bar_figure = np.std(cumulative_regrets, axis=0)
    else:
        error_bar_figure = np.quantile(cumulative_regrets, q=[0.25, 0.75], axis=0)

    return (
        np.mean(rewards, 0),
        np.mean(regrets, 0),
        np.mean(expectations, 0),
        error_bar_figure,
    )  # Averaging the results


# Implementation of UCB1
# The optional parameters corresponds to a scaling factor for the confidence intervals
def ucb(bandit, satisfaction_level, nb_step, parameter=1):
    """Run UCB1 for `nb_step` steps on `bandit`.

    Each arm is played once, then the arm with the largest upper confidence
    bound is played. `parameter` scales the confidence intervals.

    Returns three lists with one entry per step: the observed rewards, the
    regrets (shortfall of the played arm's mean with respect to
    ``min(satisfaction_level, best mean)``, floored at 0) and the means of
    the played arms.
    """
    n_arms = bandit.nb_arm
    # Regret reference: the satisfaction level, capped by the best achievable mean
    target = min(satisfaction_level, np.max(bandit.means))
    rewards, regrets, expectations = [], [], []
    pulls = np.zeros(n_arms)  # Number of pulls of each arm
    totals = np.zeros(n_arms)  # Cumulated reward of each arm

    def play(arm):
        # Pull the arm and update statistics and metrics
        payoff = bandit.pull(arm)
        pulls[arm] += 1
        totals[arm] += payoff
        rewards.append(payoff)
        regrets.append(max(0, target - bandit.means[arm]))
        expectations.append(bandit.means[arm])

    # Initialisation: play each arm once (or as many as the horizon allows)
    for arm in range(min(n_arms, nb_step)):
        play(arm)

    for t in range(n_arms, nb_step):
        radius = parameter * np.sqrt(2 * np.log(1 + t * (np.log(t) ** 2)))
        index = totals / pulls + np.ones(n_arms) * radius / np.sqrt(pulls)
        play(np.argmax(index))
    return rewards, regrets, expectations


def algo1(bandit, satisfaction_level, nb_step, parameter=1):
    """Play the empirically best arm when it looks satisfying, else a random arm.

    Each arm is played once. Afterwards, if the largest empirical mean is at
    least `satisfaction_level` that arm is played; otherwise an arm is drawn
    uniformly at random. `parameter` is not used.

    Returns three lists with one entry per step: the observed rewards, the
    regrets (shortfall of the played arm's mean with respect to
    ``min(satisfaction_level, best mean)``, floored at 0) and the means of
    the played arms.
    """
    n_arms = bandit.nb_arm
    # Regret reference: the satisfaction level, capped by the best achievable mean
    target = min(satisfaction_level, np.max(bandit.means))
    rewards, regrets, expectations = [], [], []
    pulls = np.zeros(n_arms)  # Number of pulls of each arm
    totals = np.zeros(n_arms)  # Cumulated reward of each arm

    def play(arm):
        # Pull the arm and update statistics and metrics
        payoff = bandit.pull(arm)
        pulls[arm] += 1
        totals[arm] += payoff
        rewards.append(payoff)
        regrets.append(max(0, target - bandit.means[arm]))
        expectations.append(bandit.means[arm])

    # Initialisation: play each arm once (or as many as the horizon allows)
    for arm in range(min(n_arms, nb_step)):
        play(arm)

    for _ in range(n_arms, nb_step):
        means_hat = totals / pulls
        leader = np.argmax(means_hat)
        if means_hat[leader] >= satisfaction_level:
            play(leader)
        else:
            play(np.random.randint(n_arms))
    return rewards, regrets, expectations


def algoroundrobin(bandit, satisfaction_level, nb_step, parameter=1):
    """Play the empirically best arm when it looks satisfying, else sweep all arms.

    Each arm is played once. Afterwards a round-robin sweep over the arms is
    (re)armed whenever the largest empirical mean is below
    `satisfaction_level`; a sweep is also active at the start. While a sweep
    is active the next arm of the sweep is played, otherwise the empirically
    best arm is played. `parameter` is not used.

    Returns three lists with one entry per step: the observed rewards, the
    regrets (shortfall of the played arm's mean with respect to
    ``min(satisfaction_level, best mean)``, floored at 0) and the means of
    the played arms.
    """
    n_arms = bandit.nb_arm
    # Regret reference: the satisfaction level, capped by the best achievable mean
    target = min(satisfaction_level, np.max(bandit.means))
    rewards, regrets, expectations = [], [], []
    pulls = np.zeros(n_arms)  # Number of pulls of each arm
    totals = np.zeros(n_arms)  # Cumulated reward of each arm

    def play(arm):
        # Pull the arm and update statistics and metrics
        payoff = bandit.pull(arm)
        pulls[arm] += 1
        totals[arm] += payoff
        rewards.append(payoff)
        regrets.append(max(0, target - bandit.means[arm]))
        expectations.append(bandit.means[arm])

    # Initialisation: play each arm once (or as many as the horizon allows)
    for arm in range(min(n_arms, nb_step)):
        play(arm)

    sweeping = True  # A round-robin sweep is in progress
    cursor = 0  # Next arm of the sweep
    for _ in range(n_arms, nb_step):
        means_hat = totals / pulls
        leader = np.argmax(means_hat)
        if not (means_hat[leader] >= satisfaction_level):
            sweeping = True

        if sweeping:
            arm = cursor
            cursor += 1
            if cursor >= n_arms:  # Sweep completed
                sweeping = False
                cursor = 0
        else:
            arm = leader
        play(arm)
    return rewards, regrets, expectations


def algo1xucb(bandit, satisfaction_level, nb_step, parameter=1):
    """Play the empirically best arm when it looks satisfying, else the UCB arm.

    Each arm is played once. Afterwards, if the largest empirical mean is at
    least `satisfaction_level` that arm is played; otherwise the arm with the
    largest upper confidence bound is played. `parameter` scales the
    confidence intervals.

    Returns three lists with one entry per step: the observed rewards, the
    regrets (shortfall of the played arm's mean with respect to
    ``min(satisfaction_level, best mean)``, floored at 0) and the means of
    the played arms.
    """
    n_arms = bandit.nb_arm
    # Regret reference: the satisfaction level, capped by the best achievable mean
    target = min(satisfaction_level, np.max(bandit.means))
    rewards, regrets, expectations = [], [], []
    pulls = np.zeros(n_arms)  # Number of pulls of each arm
    totals = np.zeros(n_arms)  # Cumulated reward of each arm

    def play(arm):
        # Pull the arm and update statistics and metrics
        payoff = bandit.pull(arm)
        pulls[arm] += 1
        totals[arm] += payoff
        rewards.append(payoff)
        regrets.append(max(0, target - bandit.means[arm]))
        expectations.append(bandit.means[arm])

    # Initialisation: play each arm once (or as many as the horizon allows)
    for arm in range(min(n_arms, nb_step)):
        play(arm)

    for t in range(n_arms, nb_step):
        radius = parameter * np.sqrt(2 * np.log(1 + t * (np.log(t) ** 2)))
        means_hat = totals / pulls
        optimistic = means_hat + np.ones(n_arms) * radius / np.sqrt(pulls)
        leader = np.argmax(means_hat)
        explorer = np.argmax(optimistic)
        play(leader if means_hat[leader] >= satisfaction_level else explorer)
    return rewards, regrets, expectations


def algo3_old(bandit, satisfaction_level, nb_step, parameter=1):
    """Earlier variant of Algorithm 3 based on a ratio index.

    Each arm is played once. Afterwards, if some upper confidence bound
    reaches `satisfaction_level`, the arm maximising
    ``(ucb - max(lcb, satisfaction_level)) / confidence`` is played; otherwise
    the arm with the largest upper confidence bound is played. `parameter`
    scales the confidence intervals.

    Returns three lists with one entry per step: the observed rewards, the
    regrets (shortfall of the played arm's mean with respect to
    ``min(satisfaction_level, best mean)``, floored at 0) and the means of
    the played arms.
    """
    n_arms = bandit.nb_arm
    # Regret reference: the satisfaction level, capped by the best achievable mean
    target = min(satisfaction_level, np.max(bandit.means))
    rewards, regrets, expectations = [], [], []
    pulls = np.zeros(n_arms)  # Number of pulls of each arm
    totals = np.zeros(n_arms)  # Cumulated reward of each arm

    def play(arm):
        # Pull the arm and update statistics and metrics
        payoff = bandit.pull(arm)
        pulls[arm] += 1
        totals[arm] += payoff
        rewards.append(payoff)
        regrets.append(max(0, target - bandit.means[arm]))
        expectations.append(bandit.means[arm])

    # Initialisation: play each arm once (or as many as the horizon allows)
    for arm in range(min(n_arms, nb_step)):
        play(arm)

    for t in range(n_arms, nb_step):
        # Confidence intervals and ratio index
        radius = parameter * np.sqrt(2 * np.log(1 + t * (np.log(t) ** 2)))
        width = np.ones(n_arms) * radius / np.sqrt(pulls)
        means_hat = totals / pulls
        upper = means_hat + width
        lower = means_hat - width
        ratio = (upper - np.maximum(lower, satisfaction_level)) / width
        top = np.argmax(upper)
        play(np.argmax(ratio) if upper[top] >= satisfaction_level else top)
    return rewards, regrets, expectations


# Implementation of Algorithm 3
# The optional parameters corresponds to a scaling factor for the confidence intervals
def algo3(bandit, satisfaction_level, nb_step, parameter=1):
    """Run Algorithm 3 for `nb_step` steps on `bandit`.

    Each arm is played once. Afterwards:
    - if the largest empirical mean is at least `satisfaction_level`, the arm
      maximising ``(ucb - max(lcb, satisfaction_level)) / confidence`` is played;
    - else, if some upper confidence bound reaches `satisfaction_level`, an
      arm is drawn uniformly among those whose bound reaches it;
    - else the arm with the largest upper confidence bound is played.
    `parameter` scales the confidence intervals.

    Returns three lists with one entry per step: the observed rewards, the
    regrets (shortfall of the played arm's mean with respect to
    ``min(satisfaction_level, best mean)``, floored at 0) and the means of
    the played arms.
    """
    n_arms = bandit.nb_arm
    # Regret reference: the satisfaction level, capped by the best achievable mean
    target = min(satisfaction_level, np.max(bandit.means))
    rewards, regrets, expectations = [], [], []
    pulls = np.zeros(n_arms)  # Number of pulls of each arm
    totals = np.zeros(n_arms)  # Cumulated reward of each arm

    def play(arm):
        # Pull the arm and update statistics and metrics
        payoff = bandit.pull(arm)
        pulls[arm] += 1
        totals[arm] += payoff
        rewards.append(payoff)
        regrets.append(max(0, target - bandit.means[arm]))
        expectations.append(bandit.means[arm])

    # Initialisation: play each arm once (or as many as the horizon allows)
    for arm in range(min(n_arms, nb_step)):
        play(arm)

    for t in range(n_arms, nb_step):
        # Confidence intervals and ratio index
        radius = parameter * np.sqrt(2 * np.log(1 + t * (np.log(t) ** 2)))
        width = np.ones(n_arms) * radius / np.sqrt(pulls)
        means_hat = totals / pulls
        upper = means_hat + width
        lower = means_hat - width
        ratio = (upper - np.maximum(lower, satisfaction_level)) / width

        top_upper = np.argmax(upper)
        top_mean = np.argmax(means_hat)
        if means_hat[top_mean] >= satisfaction_level:
            arm = np.argmax(ratio)
        elif upper[top_upper] >= satisfaction_level:
            candidates = np.flatnonzero(upper >= satisfaction_level)
            arm = np.random.choice(candidates)
        else:
            arm = top_upper
        play(arm)
    return rewards, regrets, expectations


# Implementation of Algorithm New
# The optional parameters corresponds to a scaling factor for the confidence intervals
def algo3new(bandit, satisfaction_level, nb_step, parameter=1):
    """Run "Algorithm New" for `nb_step` steps on `bandit`.

    Unlike the other algorithms, arm selection does not compare against the
    `satisfaction_level` argument (which only enters the regret metric) but
    against the module-level threshold `setting.threshold_sat`, which this
    function reads AND modifies in place at every step. The value it holds
    on entry is whatever the caller (or a previous run) left there.

    Each arm is played once. At every later step the threshold is first
    adapted (see the `while` loop below), then:
    - if the largest lower confidence bound reaches the threshold, that arm
      is played;
    - else, if the largest upper confidence bound reaches the threshold, the
      arm maximising ``(ucb - threshold) / (threshold - lcb)`` is played;
    - else the arm with the largest upper confidence bound is played.
    `parameter` scales the confidence intervals.

    Returns three lists with one entry per step: the observed rewards, the
    regrets (shortfall of the played arm's mean with respect to
    ``min(satisfaction_level, best mean)``, floored at 0) and the means of
    the played arms.
    """
    confidence_multiplier = parameter
    nb_arm = bandit.nb_arm
    best_arm_expectation = np.max(bandit.means)
    rewards = []  # Empirical reward at each step
    regrets = []  # Satisfying regret at each step
    expectations = []  # Expected reward at each step (for the played policy)
    nb_pull = np.zeros(nb_arm)  # Number of pull of each arm
    arm_rewards = np.zeros(nb_arm)  # Empirical total reward of each arm

    for i in range(nb_arm):  # Play each arm once
        if i >= nb_step:
            break
        chosen_arm = i
        # Update rewards and metrics
        reward = bandit.pull(chosen_arm)
        nb_pull[chosen_arm] += 1
        arm_rewards[chosen_arm] += reward
        rewards.append(reward)
        # regrets.append(max(0,satisfaction_level-bandit.means[chosen_arm]))
        regrets.append(
            max(
                0,
                min(satisfaction_level, best_arm_expectation)
                - bandit.means[chosen_arm],
            )
        )
        expectations.append(bandit.means[chosen_arm])

    # setting.threshold_sat=0.25
    # ind_cons is the exponent of the step size 0.5**ind_cons used to move the
    # threshold; it only grows, so the moves shrink over the whole run
    ind_cons = 1
    condition_lu = True
    for i in range(nb_arm, nb_step):
        # Compute confidences interval and indices
        R = confidence_multiplier * np.sqrt(2 * np.log(1 + i * (np.log(i) ** 2)))
        confidence = np.ones(nb_arm) * R / np.sqrt(nb_pull)
        emp_avg = arm_rewards / nb_pull
        ucb = emp_avg + confidence
        lcb = emp_avg - confidence
        # NOTE(review): sort_arm_rewards is not read anywhere below in this function
        sort_arm_rewards = np.sort(arm_rewards / nb_pull)
        # setting.threshold_sat=sort_ucb_rewards[0]
        ##################################################################
        #### Change based on constant numbers
        # if np.any(lcb > setting.threshold_sat):
        #     # vect=setting.threshold_sat*np.ones(nb_arm)
        #     # vect=np.where(lcb > setting.threshold_sat, lcb, vect)
        #     # setting.threshold_sat=np.max(vect)+0.0001
        #     setting.threshold_sat+=((1/2)**ind_cons)
        #     ind_cons+=1
        # if np.all(setting.threshold_sat> ucb):
        #     # setting.threshold_sat=np.max(ucb)-0.0001
        #     setting.threshold_sat-=((1/2)**ind_cons)
        #     ind_cons+=1
        #### Change based on constant numbers
        # Threshold adaptation: raise the threshold when some lower bound
        # exceeds it, lower it (and try again) when it exceeds every upper bound.
        # NOTE(review): the decrements shrink geometrically, so if the
        # threshold starts far above every upper bound their total may be too
        # small to bring it back down and this loop may not terminate -- confirm.
        while condition_lu:
            condition_lu = False
            if np.any(lcb > setting.threshold_sat):
                ind_cons += 1
                setting.threshold_sat += 0.5**ind_cons
            if np.all(setting.threshold_sat > ucb):
                condition_lu = True
                setting.threshold_sat -= 0.5 ** (ind_cons - 1)
                ind_cons += 1
        # Re-arm the flag so the adaptation loop runs again at the next step
        condition_lu = True
        ######################################################################
        max_ucb_arm = np.argmax(ucb)
        max_lcb_arm = np.argmax(lcb)
        # min_lcb_arm = np.argmin(lcb)
        # setting.threshold_sat= lcb[max_lcb_arm]
        # The ratio is only used in the branch where every lower bound is
        # below the threshold, i.e. where its denominator is positive
        ratio = (ucb - setting.threshold_sat) / (setting.threshold_sat - lcb)
        # max_avg_arm = np.argmax(emp_avg)
        if lcb[max_lcb_arm] >= setting.threshold_sat:
            chosen_arm = max_lcb_arm
        elif ucb[max_ucb_arm] >= setting.threshold_sat:
            # indices = np.where(ucb >= setting.threshold_sat)[0]
            chosen_arm = np.argmax(ratio)
        else:
            chosen_arm = np.argmax(ucb)
        # Update rewards and metrics
        reward = bandit.pull(chosen_arm)
        nb_pull[chosen_arm] += 1
        arm_rewards[chosen_arm] += reward
        rewards.append(reward)
        # regrets.append(max(0,satisfaction_level-bandit.means[chosen_arm]))
        regrets.append(
            max(
                0,
                min(satisfaction_level, best_arm_expectation)
                - bandit.means[chosen_arm],
            )
        )
        expectations.append(bandit.means[chosen_arm])
    return rewards, regrets, expectations


# Implementation of Algorithm 1 (Variant with UCB instead of ratio index)
# The optional parameters corresponds to a scaling factor for the confidence intervals
def algo3xucb(bandit, satisfaction_level, nb_step, parameter=1):
    """Variant that ranks satisfying arms by UCB instead of the ratio index.

    Each arm is played once. Afterwards:
    - among the arms whose empirical mean is at least `satisfaction_level`,
      the one with the largest upper confidence bound is played (the last
      such arm in case of ties);
    - else, if some upper confidence bound reaches `satisfaction_level`, an
      arm is drawn uniformly among those whose bound reaches it;
    - else the arm with the largest upper confidence bound is played.
    `parameter` scales the confidence intervals.

    Returns three lists with one entry per step: the observed rewards, the
    regrets (shortfall of the played arm's mean with respect to
    ``min(satisfaction_level, best mean)``, floored at 0) and the means of
    the played arms.
    """
    confidence_multiplier = parameter
    nb_arm = bandit.nb_arm
    # Regret reference: the satisfaction level, capped by the best achievable mean
    regret_target = min(satisfaction_level, np.max(bandit.means))
    rewards = []  # Empirical reward at each step
    regrets = []  # Satisfying regret at each step
    expectations = []  # Expected reward at each step (for the played policy)
    nb_pull = np.zeros(nb_arm)  # Number of pull of each arm
    arm_rewards = np.zeros(nb_arm)  # Empirical total reward of each arm

    def play(arm):
        # Pull the arm and update statistics and metrics
        reward = bandit.pull(arm)
        nb_pull[arm] += 1
        arm_rewards[arm] += reward
        rewards.append(reward)
        regrets.append(max(0, regret_target - bandit.means[arm]))
        expectations.append(bandit.means[arm])

    # Initialisation: play each arm once (or as many as the horizon allows)
    for arm in range(min(nb_arm, nb_step)):
        play(arm)

    for step in range(nb_arm, nb_step):
        # Compute confidences interval and indices
        R = confidence_multiplier * np.sqrt(
            2 * np.log(1 + step * (np.log(step) ** 2))
        )
        confidence = np.ones(nb_arm) * R / np.sqrt(nb_pull)
        emp_avg = arm_rewards / nb_pull
        ucb = emp_avg + confidence

        max_ucb_arm = np.argmax(ucb)
        # Find the arm with the highest ucb among those with average reward
        # >= satisfaction level. The scan uses its own names so that it does
        # not rebind the step counter of the enclosing loop.
        best_bound = satisfaction_level
        satisfying_arm = -1
        for arm, (bound, avg) in enumerate(zip(ucb, emp_avg)):
            if bound >= best_bound and avg >= satisfaction_level:
                best_bound = bound
                satisfying_arm = arm

        if satisfying_arm != -1:
            chosen_arm = satisfying_arm
        elif ucb[max_ucb_arm] >= satisfaction_level:
            indices = np.where(ucb >= satisfaction_level)[0]
            chosen_arm = np.random.choice(indices)
        else:
            chosen_arm = max_ucb_arm
        play(chosen_arm)
    return rewards, regrets, expectations


# Implementation of Algorithm 1 (Variant using empirical average reward instead of ratio index)
# The optional parameters corresponds to a scaling factor for the confidence intervals
def algo3xavg(bandit, satisfaction_level, nb_step, parameter=1):
    """Variant that uses the empirical mean instead of the ratio index.

    Each arm is played once. Afterwards:
    - if the largest empirical mean is at least `satisfaction_level`, that
      arm is played;
    - else, if some upper confidence bound reaches `satisfaction_level`, an
      arm is drawn uniformly among those whose bound reaches it;
    - else the arm with the largest upper confidence bound is played.
    `parameter` scales the confidence intervals.

    Returns three lists with one entry per step: the observed rewards, the
    regrets (shortfall of the played arm's mean with respect to
    ``min(satisfaction_level, best mean)``, floored at 0) and the means of
    the played arms.
    """
    n_arms = bandit.nb_arm
    # Regret reference: the satisfaction level, capped by the best achievable mean
    target = min(satisfaction_level, np.max(bandit.means))
    rewards, regrets, expectations = [], [], []
    pulls = np.zeros(n_arms)  # Number of pulls of each arm
    totals = np.zeros(n_arms)  # Cumulated reward of each arm

    def play(arm):
        # Pull the arm and update statistics and metrics
        payoff = bandit.pull(arm)
        pulls[arm] += 1
        totals[arm] += payoff
        rewards.append(payoff)
        regrets.append(max(0, target - bandit.means[arm]))
        expectations.append(bandit.means[arm])

    # Initialisation: play each arm once (or as many as the horizon allows)
    for arm in range(min(n_arms, nb_step)):
        play(arm)

    for t in range(n_arms, nb_step):
        radius = parameter * np.sqrt(2 * np.log(1 + t * (np.log(t) ** 2)))
        width = np.ones(n_arms) * radius / np.sqrt(pulls)
        means_hat = totals / pulls
        upper = means_hat + width

        top_upper = np.argmax(upper)
        top_mean = np.argmax(means_hat)
        if means_hat[top_mean] >= satisfaction_level:
            arm = np.argmax(means_hat)
        elif upper[top_upper] >= satisfaction_level:
            candidates = np.flatnonzero(upper >= satisfaction_level)
            arm = np.random.choice(candidates)
        else:
            arm = top_upper
        play(arm)
    return rewards, regrets, expectations


# Implementation of Thompson sampling for Bernoulli bandits
# The optional parameter is unused
def bernoulli_thompson(bandit, satisfaction_level, nb_step, parameter=1):
    """Run Thompson sampling with Beta posteriors for `nb_step` steps.

    At each step one sample is drawn per arm from its Beta posterior and the
    arm with the largest sample is played; the reward is added to the first
    Beta parameter of that arm and ``1 - reward`` to the second. `parameter`
    is not used.

    Side effect: the first Beta parameters are stored in the module-level
    `setting.alpha`, which is overwritten at the start of the run and
    updated in place afterwards.

    Returns three lists with one entry per step: the observed rewards, the
    regrets (shortfall of the played arm's mean with respect to
    ``min(satisfaction_level, best mean)``, floored at 0) and the means of
    the played arms.
    """
    n_arms = bandit.nb_arm
    # Regret reference: the satisfaction level, capped by the best achievable mean
    target = min(satisfaction_level, np.max(bandit.means))
    rewards, regrets, expectations = [], [], []
    pulls = np.zeros(n_arms)  # Number of pulls of each arm
    totals = np.zeros(n_arms)  # Cumulated reward of each arm
    # NOTE(review): setting.alpha is also read as a multiplier by other
    # algorithms in this file; overwriting it here looks unintended -- confirm
    # whether a local array was meant.
    setting.alpha = np.ones(n_arms)
    failures = np.ones(n_arms)  # Second Beta parameter of each arm

    for _ in range(nb_step):
        samples = np.random.beta(setting.alpha, failures)
        arm = np.argmax(samples)

        payoff = bandit.pull(arm)
        # Posterior update
        setting.alpha[arm] += payoff
        failures[arm] += 1 - payoff
        # Statistics and metrics
        pulls[arm] += 1
        totals[arm] += payoff
        rewards.append(payoff)
        regrets.append(max(0, target - bandit.means[arm]))
        expectations.append(bandit.means[arm])
    return rewards, regrets, expectations


# Implementation of epsilon-greedy algorithm (epsilon_n = min(1, cK/(d^2 n)) where K is the number of arms and d is fixed to 1)
# The optional parameter corresponds to the parameter c
def epsilon_greedy(bandit, satisfaction_level, nb_step, parameter=1):
    """Run epsilon-greedy with a decaying exploration rate for `nb_step` steps.

    At step n (counted from 1) the exploration rate is
    ``min(1, c * K / (d * d * n))`` with ``c = parameter``, K the number of
    arms and d fixed to 1. A uniform draw in [0, 1) that exceeds the rate
    triggers the greedy choice (largest empirical mean, arms never pulled
    counting as 0); otherwise an arm is drawn uniformly at random.

    Returns three lists with one entry per step: the observed rewards, the
    regrets (shortfall of the played arm's mean with respect to
    ``min(satisfaction_level, best mean)``, floored at 0) and the means of
    the played arms.
    """
    n_arms = bandit.nb_arm
    # Regret reference: the satisfaction level, capped by the best achievable mean
    target = min(satisfaction_level, np.max(bandit.means))
    c = parameter
    d = 1
    rewards, regrets, expectations = [], [], []
    pulls = np.zeros(n_arms)  # Number of pulls of each arm
    totals = np.zeros(n_arms)  # Cumulated reward of each arm

    for step in range(nb_step):
        epsilon = min(1, c * n_arms / (d * d * (step + 1)))
        exploit = np.random.uniform(0, 1) > epsilon
        if exploit:
            # np.maximum avoids dividing by zero for arms never pulled
            arm = np.argmax(totals / np.maximum(pulls, 1))
        else:
            arm = np.random.randint(0, n_arms)

        payoff = bandit.pull(arm)
        pulls[arm] += 1
        totals[arm] += payoff
        rewards.append(payoff)
        regrets.append(max(0, target - bandit.means[arm]))
        expectations.append(bandit.means[arm])
    return rewards, regrets, expectations


# Implementation of UCB setting.alpha (when it does not eliminate the bad arms)
# The optional parameters corresponds to a scaling factor for the confidence intervals
def ucb_alpha_no_elimination(bandit, satisfaction_level, nb_step, parameter=1):
    """Run the UCB-alpha index policy without eliminating arms.

    Each arm is played twice. Afterwards the arm maximising
    ``empirical mean + parameter * setting.alpha * radius`` is played, where
    the radius of an arm pulled n times is
    ``sqrt((2 / n) * log(3 * log(n)**2 / setting.delta_conf_level))``.
    Only the radius of the arm just played is refreshed at each step.
    `setting.alpha` and `setting.delta_conf_level` are read from the
    module-level `setting` object.

    Returns three lists with one entry per step: the observed rewards, the
    regrets (shortfall of the played arm's mean with respect to
    ``min(satisfaction_level, best mean)``, floored at 0) and the means of
    the played arms.
    """
    n_arms = bandit.nb_arm
    # Regret reference: the satisfaction level, capped by the best achievable mean
    target = min(satisfaction_level, np.max(bandit.means))
    rewards, regrets, expectations = [], [], []
    pulls = np.zeros(n_arms)  # Number of pulls of each arm
    totals = np.zeros(n_arms)  # Cumulated reward of each arm

    def radius(count):
        # Confidence radius for a pull count (scalar or array)
        return np.sqrt(
            (2 / count) * (np.log((3 * np.log(count) ** 2 / setting.delta_conf_level)))
        )

    def record(arm, payoff):
        # Append the metrics of one step
        rewards.append(payoff)
        regrets.append(max(0, target - bandit.means[arm]))
        expectations.append(bandit.means[arm])

    # Initialisation: play each arm twice (or as many pulls as the horizon allows)
    for step in range(min(2 * n_arms, nb_step)):
        arm = step % n_arms
        payoff = bandit.pull(arm)
        pulls[arm] += 1
        totals[arm] += payoff
        record(arm, payoff)

    # The confidence radius of each arm
    widths = radius(pulls)

    for _ in range(2 * n_arms, nb_step):
        arm = np.argmax((totals / pulls) + (parameter * setting.alpha * widths))
        payoff = bandit.pull(arm)
        pulls[arm] += 1
        totals[arm] += payoff
        widths[arm] = radius(pulls[arm])
        record(arm, payoff)

    return rewards, regrets, expectations


########################################################################
# Implementation of UCB setting.alpha (when it eliminates the bad arms)
# The optional parameters corresponds to a scaling factor for the confidence intervals
# In the following algorithm, we eliminate the bad arm i if the lower confidence bound of the current best arm is greater than the upper confidence bound of arm i: mu^*-epsilon^* > mu^i+epsilon^i
def ucb_alpha_elimination(bandit, satisfaction_level, nb_step, parameter=1):
    """Run UCB-alpha with elimination of dominated arms for ``nb_step`` steps.

    Every arm is first played twice (round-robin).  Afterwards the arm with
    the largest index ``empirical mean + parameter * setting.alpha * radius``
    is played.  After each of those steps, every arm whose upper bound
    ``mean + radius`` lies strictly below the lower bound ``mean - radius``
    of the empirically best arm has its cumulated reward and its radius set
    to zero.

    Args:
        bandit: object exposing ``nb_arm``, ``means`` (indexable by arm) and
            ``pull(arm)`` returning a reward.
        satisfaction_level: threshold used in the regret computation only.
        nb_step: total number of pulls to simulate.
        parameter: scaling factor applied to the confidence radius.

    Returns:
        ``(rewards, regrets, expectations)``: three lists with one entry per
        step (observed reward, regret with respect to
        ``min(satisfaction_level, max(bandit.means))`` floored at 0, and the
        mean of the played arm).
    """
    width_scale = parameter
    arm_count = bandit.nb_arm
    top_expectation = np.max(bandit.means)
    rewards, regrets, expectations = [], [], []
    pull_counts = np.zeros(arm_count)  # Number of pulls of each arm
    reward_sums = np.zeros(arm_count)  # Cumulated reward of each arm

    def play(arm):
        # Pull the arm, update its statistics and record the step metrics
        outcome = bandit.pull(arm)
        pull_counts[arm] += 1
        reward_sums[arm] += outcome
        rewards.append(outcome)
        shortfall = min(satisfaction_level, top_expectation) - bandit.means[arm]
        regrets.append(max(0, shortfall))
        expectations.append(bandit.means[arm])

    def radius(pulls):
        # Confidence radius as a function of the number of pulls
        # (works on a scalar as well as on the whole array)
        return np.sqrt(
            (2 / pulls) * np.log(3 * np.log(pulls) ** 2 / setting.delta_conf_level)
        )

    # Initial phase: play each arm twice, stopping early on a short horizon
    for step in range(2 * arm_count):
        if step >= nb_step:
            break
        play(step % arm_count)

    widths = radius(pull_counts)

    for _ in range(2 * arm_count, nb_step):
        # Play the arm with the highest upper confidence index
        indices = (reward_sums / pull_counts) + (width_scale * setting.alpha * widths)
        arm = np.argmax(indices)
        play(arm)
        widths[arm] = radius(pull_counts[arm])

        # Eliminate the dominated arms by zeroing their reward sum and radius
        means_hat = reward_sums / pull_counts
        leader = np.argmax(means_hat)
        upper_bounds = means_hat + widths
        leader_lower_bound = means_hat[leader] - widths[leader]
        dominated = upper_bounds < leader_lower_bound
        reward_sums[dominated] = 0
        widths[dominated] = 0

    return rewards, regrets, expectations


########################################################################
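# Implementation of satisfaction_mean_reward UCL
# (variant 1: a random eligible arm is played; the quantile of the index is driven by the
# number of steps elapsed since the initial round)
# The optional parameter corresponds to a scaling factor for the confidence intervals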
def satisfaction_mean_reward1(bandit, satisfaction_level, nb_step, parameter=1):
    """Satisficing UCL: play a uniformly random arm of the eligible set.

    Every arm is first played once.  Then, at each step, an upper index
    ``q_arm`` is computed for all arms.  An arm is *eligible* when its index
    is strictly greater than ``satisfaction_level``.  If at least one arm is
    eligible, one of them is drawn with ``np.random.choice``; otherwise the
    arm with the largest index is played.  The quantile of the index uses a
    time counter that starts at 1 right after the initial round.

    Args:
        bandit: object exposing ``nb_arm``, ``means`` (indexable by arm) and
            ``pull(arm)`` returning a reward.
        satisfaction_level: threshold defining the eligible set and the regret.
        nb_step: total number of pulls to simulate.
        parameter: scaling factor applied to the confidence term of the index.

    Returns:
        ``(rewards, regrets, expectations)``: three lists with one entry per
        step (observed reward, regret with respect to
        ``min(satisfaction_level, max(bandit.means))`` floored at 0, and the
        mean of the played arm).
    """
    confidence_multiplier = parameter
    nb_arm = bandit.nb_arm
    best_arm_expectation = np.max(bandit.means)
    rewards = []  # Empirical reward at each step
    regrets = []  # Satisfying regret at each step
    expectations = []  # Expected reward at each step (for the played policy)
    nb_pull = np.zeros(nb_arm)  # Number of pulls of each arm
    arm_rewards = np.zeros(nb_arm)  # Empirical total reward of each arm
    # The following values are constant, but they are kept as variables so that
    # other settings of the index can be tried easily
    sigma_s = 1  # Noise scale multiplying the 1/sqrt(n) confidence width
    mu_0 = 0.5  # Prior mean of each arm (no effect while delta_var == 0)
    kappa = np.sqrt(2 * np.pi * np.e)  # Constant of the quantile level 1 - 1/(kappa*t)
    delta_var = 0  # Weight of the prior in the index; 0 removes the prior

    def play(chosen_arm):
        # Pull the arm, update its statistics and record the step metrics
        reward = bandit.pull(chosen_arm)
        nb_pull[chosen_arm] += 1
        arm_rewards[chosen_arm] += reward
        rewards.append(reward)
        regrets.append(
            max(
                0,
                min(satisfaction_level, best_arm_expectation)
                - bandit.means[chosen_arm],
            )
        )
        expectations.append(bandit.means[chosen_arm])

    def eval_q_arm(t):
        # Upper index of every arm: (prior-weighted) mean plus a normal quantile
        # scaled by the confidence width
        center = ((delta_var**2) * mu_0 + arm_rewards) / ((delta_var**2) + nb_pull)
        width = sigma_s / np.sqrt((delta_var**2) + nb_pull)
        return center + confidence_multiplier * width * norm.ppf(
            1 - 1 / (kappa * t), loc=0, scale=1
        )

    for arm in range(nb_arm):  # Play each arm once
        if arm >= nb_step:
            break
        play(arm)

    for i in range(nb_arm + 1, nb_step + 1):
        q_arm = eval_q_arm(i - nb_arm)
        # Explicit emptiness test: comparing the index array with np.array([])
        # is not a valid check (empty or non-broadcastable comparison)
        eligible = np.flatnonzero(q_arm > satisfaction_level)
        if eligible.size > 0:
            chosen_arm = np.random.choice(eligible)
        else:
            chosen_arm = np.argmax(q_arm)
        play(chosen_arm)
    return rewards, regrets, expectations


########################################################################
# Implementation of UCL: Bayes-UCB (the arm with the largest index is always played)
# The optional parameter corresponds to a scaling factor for the confidence intervals
def deterministic_ucl(bandit, satisfaction_level, nb_step, parameter=1):
    """Run the deterministic UCL (Bayes-UCB style) policy for ``nb_step`` steps.

    Every arm is first played once.  Afterwards the arm maximising the upper
    index is played at each step.  ``satisfaction_level`` only enters the
    regret computation, never the arm selection.  The quantile of the index
    uses a time counter that starts at 1 right after the initial round.

    Args:
        bandit: object exposing ``nb_arm``, ``means`` (indexable by arm) and
            ``pull(arm)`` returning a reward.
        satisfaction_level: threshold used in the regret computation.
        nb_step: total number of pulls to simulate.
        parameter: scaling factor applied to the confidence term of the index.

    Returns:
        ``(rewards, regrets, expectations)``: three lists with one entry per
        step (observed reward, regret with respect to
        ``min(satisfaction_level, max(bandit.means))`` floored at 0, and the
        mean of the played arm).
    """
    width_scale = parameter
    arm_count = bandit.nb_arm
    top_expectation = np.max(bandit.means)
    rewards, regrets, expectations = [], [], []
    pull_counts = np.zeros(arm_count)  # Number of pulls of each arm
    reward_sums = np.zeros(arm_count)  # Cumulated reward of each arm

    # Constants of the index, kept as variables so other values can be tried
    noise_scale = 1  # Multiplies the 1/sqrt(n) confidence width
    prior_std = setting.val_prior  # Read from the settings; unused by the index below
    prior_mean = 0.5  # No effect while prior_weight == 0
    kappa = np.sqrt(2 * np.pi * np.e)  # Constant of the quantile level 1 - 1/(kappa*t)
    prior_weight = 0  # 0 removes the prior from the index

    def play(arm):
        # Pull the arm, update its statistics and record the step metrics
        outcome = bandit.pull(arm)
        pull_counts[arm] += 1
        reward_sums[arm] += outcome
        rewards.append(outcome)
        shortfall = min(satisfaction_level, top_expectation) - bandit.means[arm]
        regrets.append(max(0, shortfall))
        expectations.append(bandit.means[arm])

    def upper_index(t):
        # (Prior-weighted) mean of every arm plus the scaled normal quantile
        center = ((prior_weight**2) * prior_mean + reward_sums) / (
            (prior_weight**2) + pull_counts
        )
        spread = noise_scale / np.sqrt((prior_weight**2) + pull_counts)
        quantile = norm.ppf(1 - 1 / (kappa * t), loc=0, scale=1)
        return center + width_scale * spread * quantile

    # Initial phase: play each arm once, stopping early on a short horizon
    for arm in range(arm_count):
        if arm >= nb_step:
            break
        play(arm)

    # Main phase: always play the arm with the largest upper index
    for step in range(arm_count + 1, nb_step + 1):
        play(np.argmax(upper_index(step - arm_count)))

    return rewards, regrets, expectations


########################################################################
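# Implementation of satisfaction_mean_reward UCL
# (variant 2: a random eligible arm is played; the quantile of the index is driven by the
# total number of steps already played)
# The optional parameter corresponds to a scaling factor for the confidence intervals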
def satisfaction_mean_reward2(bandit, satisfaction_level, nb_step, parameter=1):
    """Satisficing UCL: random eligible arm, quantile driven by the global step.

    Every arm is first played once.  Then, at each step, an upper index
    ``q_arm`` is computed for all arms.  An arm is *eligible* when its index
    is strictly greater than ``satisfaction_level``.  If at least one arm is
    eligible, one of them is drawn with ``np.random.choice``; otherwise the
    arm with the largest index is played.  The quantile of the index uses the
    number of steps already played (initial round included).

    Args:
        bandit: object exposing ``nb_arm``, ``means`` (indexable by arm) and
            ``pull(arm)`` returning a reward.
        satisfaction_level: threshold defining the eligible set and the regret.
        nb_step: total number of pulls to simulate.
        parameter: scaling factor applied to the confidence term of the index.

    Returns:
        ``(rewards, regrets, expectations)``: three lists with one entry per
        step (observed reward, regret with respect to
        ``min(satisfaction_level, max(bandit.means))`` floored at 0, and the
        mean of the played arm).
    """
    confidence_multiplier = parameter
    nb_arm = bandit.nb_arm
    best_arm_expectation = np.max(bandit.means)
    rewards = []  # Empirical reward at each step
    regrets = []  # Satisfying regret at each step
    expectations = []  # Expected reward at each step (for the played policy)
    nb_pull = np.zeros(nb_arm)  # Number of pulls of each arm
    arm_rewards = np.zeros(nb_arm)  # Empirical total reward of each arm
    # The following values are constant, but they are kept as variables so that
    # other settings of the index can be tried easily
    sigma_s = 1  # Noise scale multiplying the 1/sqrt(n) confidence width
    mu_0 = 0.5  # Prior mean of each arm (no effect while delta_var == 0)
    kappa = np.sqrt(2 * np.pi * np.e)  # Constant of the quantile level 1 - 1/(kappa*t)
    delta_var = 0  # Weight of the prior in the index; 0 removes the prior

    def play(chosen_arm):
        # Pull the arm, update its statistics and record the step metrics
        reward = bandit.pull(chosen_arm)
        nb_pull[chosen_arm] += 1
        arm_rewards[chosen_arm] += reward
        rewards.append(reward)
        regrets.append(
            max(
                0,
                min(satisfaction_level, best_arm_expectation)
                - bandit.means[chosen_arm],
            )
        )
        expectations.append(bandit.means[chosen_arm])

    def eval_q_arm(t):
        # Upper index of every arm: (prior-weighted) mean plus a normal quantile
        # scaled by the confidence width
        center = ((delta_var**2) * mu_0 + arm_rewards) / ((delta_var**2) + nb_pull)
        width = sigma_s / np.sqrt((delta_var**2) + nb_pull)
        return center + confidence_multiplier * width * norm.ppf(
            1 - 1 / (kappa * t), loc=0, scale=1
        )

    for arm in range(nb_arm):  # Play each arm once
        if arm >= nb_step:
            break
        play(arm)

    for i in range(nb_arm + 1, nb_step + 1):
        q_arm = eval_q_arm(i - 1)
        # Explicit emptiness test: comparing the index array with np.array([])
        # is not a valid check (empty or non-broadcastable comparison)
        eligible = np.flatnonzero(q_arm > satisfaction_level)
        if eligible.size > 0:
            chosen_arm = np.random.choice(eligible)
        else:
            chosen_arm = np.argmax(q_arm)
        play(chosen_arm)
    return rewards, regrets, expectations


################################################################
########################################################################
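# Implementation of satisfaction_mean_reward UCL
# (variant 3: all the eligible arms are played in turn; the time index of the quantile
# advances with each pull of an eligible arm)
# The optional parameter corresponds to a scaling factor for the confidence intervals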
def satisfaction_mean_reward3(bandit, satisfaction_level, nb_step, parameter=1):
    """Satisficing UCL: play all eligible arms in turn.

    Every arm is first played once.  Then the upper index ``q_arm`` of all
    arms is computed; an arm is *eligible* when its index is strictly greater
    than ``satisfaction_level``.  If the eligible set is not empty, its arms
    are played one after the other in increasing arm order (stopping when the
    horizon is reached) before the index is recomputed, and the time index of
    the quantile advances by one for each of those pulls.  If no arm is
    eligible, the arm with the largest index is played once and the time
    index is left unchanged.

    Args:
        bandit: object exposing ``nb_arm``, ``means`` (indexable by arm) and
            ``pull(arm)`` returning a reward.
        satisfaction_level: threshold defining the eligible set and the regret.
        nb_step: total number of pulls to simulate.
        parameter: scaling factor applied to the confidence term of the index.

    Returns:
        ``(rewards, regrets, expectations)``: three lists with one entry per
        step (observed reward, regret with respect to
        ``min(satisfaction_level, max(bandit.means))`` floored at 0, and the
        mean of the played arm).
    """
    confidence_multiplier = parameter
    nb_arm = bandit.nb_arm
    best_arm_expectation = np.max(bandit.means)
    rewards = []  # Empirical reward at each step
    regrets = []  # Satisfying regret at each step
    expectations = []  # Expected reward at each step (for the played policy)
    nb_pull = np.zeros(nb_arm)  # Number of pulls of each arm
    arm_rewards = np.zeros(nb_arm)  # Empirical total reward of each arm
    # The following values are constant, but they are kept as variables so that
    # other settings of the index can be tried easily
    sigma_s = 1  # Noise scale multiplying the 1/sqrt(n) confidence width
    mu_0 = 0.5  # Prior mean of each arm (no effect while delta_var == 0)
    kappa = np.sqrt(2 * np.pi * np.e)  # Constant of the quantile level 1 - 1/(kappa*t)
    delta_var = 0  # Weight of the prior in the index; 0 removes the prior

    def play(chosen_arm):
        # Pull the arm, update its statistics and record the step metrics
        reward = bandit.pull(chosen_arm)
        nb_pull[chosen_arm] += 1
        arm_rewards[chosen_arm] += reward
        rewards.append(reward)
        regrets.append(
            max(
                0,
                min(satisfaction_level, best_arm_expectation)
                - bandit.means[chosen_arm],
            )
        )
        expectations.append(bandit.means[chosen_arm])

    def eval_q_arm(t):
        # Upper index of every arm: (prior-weighted) mean plus a normal quantile
        # scaled by the confidence width
        center = ((delta_var**2) * mu_0 + arm_rewards) / ((delta_var**2) + nb_pull)
        width = sigma_s / np.sqrt((delta_var**2) + nb_pull)
        return center + confidence_multiplier * width * norm.ppf(
            1 - 1 / (kappa * t), loc=0, scale=1
        )

    for arm in range(nb_arm):  # Play each arm once
        if arm >= nb_step:
            break
        play(arm)

    count = nb_arm - 1  # Index of the last step played so far
    i = 1  # Time index of the quantile
    while count < nb_step - 1:
        q_arm = eval_q_arm(i)
        # Explicit emptiness test: comparing the index array with np.array([])
        # is not a valid check (empty or non-broadcastable comparison)
        eligible = np.flatnonzero(q_arm > satisfaction_level)
        if eligible.size > 0:
            # Play every eligible arm once, unless the horizon is reached first
            for chosen_arm in eligible:
                if count >= nb_step - 1:
                    break
                play(chosen_arm)
                count += 1
                i += 1
        else:
            play(np.argmax(q_arm))
            count += 1
    return rewards, regrets, expectations


################################################################
########################################################################
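# Implementation of satisfaction_mean_reward UCL
# (all the eligible arms are played in turn; the time index of the quantile advances once
# per round)
# The optional parameter corresponds to a scaling factor for the confidence intervals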
def satisfaction_mean_reward(bandit, satisfaction_level, nb_step, parameter=1):
    """Satisficing UCL: play all eligible arms, one error step per round.

    Every arm is first played once.  Then, round after round, the upper index
    ``q_arm`` of all arms is computed; an arm is *eligible* when its index is
    strictly greater than ``satisfaction_level``.  If the eligible set is not
    empty, its arms are played one after the other in increasing arm order
    (stopping when the horizon is reached); otherwise the arm with the
    largest index is played once.  The time index of the quantile
    (``step_error``) is increased by one after every round, whatever the
    number of pulls of the round.

    Args:
        bandit: object exposing ``nb_arm``, ``means`` (indexable by arm) and
            ``pull(arm)`` returning a reward.
        satisfaction_level: threshold defining the eligible set and the regret.
        nb_step: total number of pulls to simulate.
        parameter: scaling factor applied to the confidence term of the index.

    Returns:
        ``(rewards, regrets, expectations)``: three lists with one entry per
        step (observed reward, regret with respect to
        ``min(satisfaction_level, max(bandit.means))`` floored at 0, and the
        mean of the played arm).
    """
    confidence_multiplier = parameter
    nb_arm = bandit.nb_arm
    best_arm_expectation = np.max(bandit.means)
    rewards = []  # Empirical reward at each step
    regrets = []  # Satisfying regret at each step
    expectations = []  # Expected reward at each step (for the played policy)
    nb_pull = np.zeros(nb_arm)  # Number of pulls of each arm
    arm_rewards = np.zeros(nb_arm)  # Empirical total reward of each arm
    # The following values are constant, but they are kept as variables so that
    # other settings of the index can be tried easily
    sigma_s = 1  # Noise scale multiplying the 1/sqrt(n) confidence width
    mu_0 = 0.5  # Prior mean of each arm (no effect while delta_var == 0)
    kappa = np.sqrt(2 * np.pi * np.e)  # Constant of the quantile level 1 - 1/(kappa*t)
    delta_var = 0  # Weight of the prior in the index; 0 removes the prior

    def play(chosen_arm):
        # Pull the arm, update its statistics and record the step metrics
        reward = bandit.pull(chosen_arm)
        nb_pull[chosen_arm] += 1
        arm_rewards[chosen_arm] += reward
        rewards.append(reward)
        regrets.append(
            max(
                0,
                min(satisfaction_level, best_arm_expectation)
                - bandit.means[chosen_arm],
            )
        )
        expectations.append(bandit.means[chosen_arm])

    def eval_q_arm(t):
        # Upper index of every arm: (prior-weighted) mean plus a normal quantile
        # scaled by the confidence width
        center = ((delta_var**2) * mu_0 + arm_rewards) / ((delta_var**2) + nb_pull)
        width = sigma_s / np.sqrt((delta_var**2) + nb_pull)
        return center + confidence_multiplier * width * norm.ppf(
            1 - 1 / (kappa * t), loc=0, scale=1
        )

    for arm in range(nb_arm):  # Play each arm once
        if arm >= nb_step:
            break
        play(arm)

    count = nb_arm - 1  # Index of the last step played so far
    step_error = 1  # Time index of the quantile, increased once per round
    while count < nb_step - 1:
        q_arm = eval_q_arm(step_error)
        # Explicit emptiness test: comparing the index array with np.array([])
        # is not a valid check (empty or non-broadcastable comparison)
        eligible = np.flatnonzero(q_arm > satisfaction_level)
        if eligible.size > 0:
            # Play every eligible arm once, unless the horizon is reached first
            for chosen_arm in eligible:
                if count >= nb_step - 1:
                    break
                play(chosen_arm)
                count += 1
        else:
            play(np.argmax(q_arm))
            count += 1
        # Increase the step of the error
        step_error += 1
    return rewards, regrets, expectations


########################################################################


########################################################################
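# Implementation of satisfaction_mean_reward UCL
# (variant 5: the most pulled arm is played whenever an arm is eligible; the quantile of the
# index is driven by the number of steps elapsed since the initial round)
# The optional parameter corresponds to a scaling factor for the confidence intervals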
def satisfaction_mean_reward5(bandit, satisfaction_level, nb_step, parameter=1):
    """Satisficing UCL: stick to the most pulled arm while an arm is eligible.

    Every arm is first played once.  Then, at each step, an upper index
    ``q_arm`` is computed for all arms.  An arm is *eligible* when its index
    is strictly greater than ``satisfaction_level``.  If at least one arm is
    eligible, the arm with the largest number of pulls is played (first such
    arm on ties); otherwise the arm with the largest index is played.  The
    quantile of the index uses a time counter that starts at 1 right after
    the initial round.

    Args:
        bandit: object exposing ``nb_arm``, ``means`` (indexable by arm) and
            ``pull(arm)`` returning a reward.
        satisfaction_level: threshold defining the eligible set and the regret.
        nb_step: total number of pulls to simulate.
        parameter: scaling factor applied to the confidence term of the index.

    Returns:
        ``(rewards, regrets, expectations)``: three lists with one entry per
        step (observed reward, regret with respect to
        ``min(satisfaction_level, max(bandit.means))`` floored at 0, and the
        mean of the played arm).
    """
    confidence_multiplier = parameter
    nb_arm = bandit.nb_arm
    best_arm_expectation = np.max(bandit.means)
    rewards = []  # Empirical reward at each step
    regrets = []  # Satisfying regret at each step
    expectations = []  # Expected reward at each step (for the played policy)
    nb_pull = np.zeros(nb_arm)  # Number of pulls of each arm
    arm_rewards = np.zeros(nb_arm)  # Empirical total reward of each arm
    # The following values are constant, but they are kept as variables so that
    # other settings of the index can be tried easily
    sigma_s = 1  # Noise scale multiplying the 1/sqrt(n) confidence width
    mu_0 = 0.5  # Prior mean of each arm (no effect while delta_var == 0)
    kappa = np.sqrt(2 * np.pi * np.e)  # Constant of the quantile level 1 - 1/(kappa*t)
    delta_var = 0  # Weight of the prior in the index; 0 removes the prior

    def play(chosen_arm):
        # Pull the arm, update its statistics and record the step metrics
        reward = bandit.pull(chosen_arm)
        nb_pull[chosen_arm] += 1
        arm_rewards[chosen_arm] += reward
        rewards.append(reward)
        regrets.append(
            max(
                0,
                min(satisfaction_level, best_arm_expectation)
                - bandit.means[chosen_arm],
            )
        )
        expectations.append(bandit.means[chosen_arm])

    def eval_q_arm(t):
        # Upper index of every arm: (prior-weighted) mean plus a normal quantile
        # scaled by the confidence width
        center = ((delta_var**2) * mu_0 + arm_rewards) / ((delta_var**2) + nb_pull)
        width = sigma_s / np.sqrt((delta_var**2) + nb_pull)
        return center + confidence_multiplier * width * norm.ppf(
            1 - 1 / (kappa * t), loc=0, scale=1
        )

    for arm in range(nb_arm):  # Play each arm once
        if arm >= nb_step:
            break
        play(arm)

    for i in range(nb_arm + 1, nb_step + 1):
        q_arm = eval_q_arm(i - nb_arm)
        # Explicit test for a non-empty eligible set: comparing the index array
        # with np.array([]) is not a valid emptiness check
        if np.any(q_arm > satisfaction_level):
            chosen_arm = np.argmax(nb_pull)
        else:
            chosen_arm = np.argmax(q_arm)
        play(chosen_arm)
    return rewards, regrets, expectations


########################################################################
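# Implementation of satisfaction_mean_reward UCL
# (variant 6: the most pulled arm is played whenever an arm is eligible; the quantile of the
# index is driven by the total number of steps already played)
# The optional parameter corresponds to a scaling factor for the confidence intervals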
def satisfaction_mean_reward6(bandit, satisfaction_level, nb_step, parameter=1):
    """Satisficing UCL: most pulled arm while eligible, global-step quantile.

    Every arm is first played once.  Then, at each step, an upper index
    ``q_arm`` is computed for all arms.  An arm is *eligible* when its index
    is strictly greater than ``satisfaction_level``.  If at least one arm is
    eligible, the arm with the largest number of pulls is played (first such
    arm on ties); otherwise the arm with the largest index is played.  The
    quantile of the index uses the number of steps already played (initial
    round included).

    Args:
        bandit: object exposing ``nb_arm``, ``means`` (indexable by arm) and
            ``pull(arm)`` returning a reward.
        satisfaction_level: threshold defining the eligible set and the regret.
        nb_step: total number of pulls to simulate.
        parameter: scaling factor applied to the confidence term of the index.

    Returns:
        ``(rewards, regrets, expectations)``: three lists with one entry per
        step (observed reward, regret with respect to
        ``min(satisfaction_level, max(bandit.means))`` floored at 0, and the
        mean of the played arm).
    """
    confidence_multiplier = parameter
    nb_arm = bandit.nb_arm
    best_arm_expectation = np.max(bandit.means)
    rewards = []  # Empirical reward at each step
    regrets = []  # Satisfying regret at each step
    expectations = []  # Expected reward at each step (for the played policy)
    nb_pull = np.zeros(nb_arm)  # Number of pulls of each arm
    arm_rewards = np.zeros(nb_arm)  # Empirical total reward of each arm
    # The following values are constant, but they are kept as variables so that
    # other settings of the index can be tried easily
    sigma_s = 1  # Noise scale multiplying the 1/sqrt(n) confidence width
    mu_0 = 0.5  # Prior mean of each arm (no effect while delta_var == 0)
    kappa = np.sqrt(2 * np.pi * np.e)  # Constant of the quantile level 1 - 1/(kappa*t)
    delta_var = 0  # Weight of the prior in the index; 0 removes the prior

    def play(chosen_arm):
        # Pull the arm, update its statistics and record the step metrics
        reward = bandit.pull(chosen_arm)
        nb_pull[chosen_arm] += 1
        arm_rewards[chosen_arm] += reward
        rewards.append(reward)
        regrets.append(
            max(
                0,
                min(satisfaction_level, best_arm_expectation)
                - bandit.means[chosen_arm],
            )
        )
        expectations.append(bandit.means[chosen_arm])

    def eval_q_arm(t):
        # Upper index of every arm: (prior-weighted) mean plus a normal quantile
        # scaled by the confidence width
        center = ((delta_var**2) * mu_0 + arm_rewards) / ((delta_var**2) + nb_pull)
        width = sigma_s / np.sqrt((delta_var**2) + nb_pull)
        return center + confidence_multiplier * width * norm.ppf(
            1 - 1 / (kappa * t), loc=0, scale=1
        )

    for arm in range(nb_arm):  # Play each arm once
        if arm >= nb_step:
            break
        play(arm)

    for i in range(nb_arm + 1, nb_step + 1):
        q_arm = eval_q_arm(i - 1)
        # Explicit test for a non-empty eligible set: comparing the index array
        # with np.array([]) is not a valid emptiness check
        if np.any(q_arm > satisfaction_level):
            chosen_arm = np.argmax(nb_pull)
        else:
            chosen_arm = np.argmax(q_arm)
        play(chosen_arm)
    return rewards, regrets, expectations


################################################################
