from stochastic_offline_envs.policies.c4_optimal import C4Optimal
from stochastic_offline_envs.policies.base import PolicyStep, BasePolicy
import numpy as np
from collections import namedtuple

# PolicyInfoExploitable = namedtuple("PolicyInfo", ['optimal'])
# Field-less info record; instances carry no per-step metadata.
PolicyInfo = namedtuple("PolicyInfo", ())


class C4Exploitable(C4Optimal):
    """C4Optimal variant that never plays action 6.

    Whenever the parent policy selects action 6, it is swapped for an
    action drawn with ``np.random.randint(6)`` (i.e. one of 0..5). The
    parent's ``info`` is passed through unchanged.
    """

    def sample(self, obs, reward, t):
        """Return the parent's step, with action 6 replaced by a random 0..5."""
        chosen, extra = super().sample(obs, reward, t)
        # The RNG is consulted only when the parent picked the last action.
        chosen = np.random.randint(6) if chosen == 6 else chosen
        return PolicyStep(action=chosen, info=extra)


class C4MarkovExploitable(C4Optimal):
    """C4Optimal variant that sometimes avoids action 6.

    When the parent policy selects action 6, then with probability
    ``regen_prob`` the action is replaced by ``np.random.randint(6)``
    (one of 0..5). The returned ``info`` is a dict whose ``'optimal'``
    entry is False exactly when such a replacement happened.
    """

    def __init__(self, exec_dir, regen_prob=0.2):
        """Forward ``exec_dir`` to the parent and store ``regen_prob``."""
        super().__init__(exec_dir)
        self.regen_prob = regen_prob

    def sample(self, obs, reward, t):
        """Return a PolicyStep with the (possibly replaced) action and an
        ``{'optimal': bool}`` info dict; the parent's info is discarded."""
        chosen, _ = super().sample(obs, reward, t)
        # Short-circuit: the probability draw happens only when the parent
        # picked action 6, and the replacement draw only if that check passes.
        deviate = chosen == 6 and np.random.random() < self.regen_prob
        if deviate:
            chosen = np.random.randint(6)  # any action except the last one
        return PolicyStep(action=chosen, info={'optimal': not deviate})

# class C4Specialized(BasePolicy):

# 	def sample(self, obs, reward, t):
# 		return PolicyStep(action=6, info=PolicyInfo())


class C4Specialized(BasePolicy):

    """Agent built around playing action 6 (the right-most column).

       Action 6 is always played while ``t < 5`` and with probability 0.1
       on later steps; otherwise the action is ``np.random.randint(6)``,
       i.e. one of 0..5. ``obs`` and ``reward`` are not used.
    """

    def sample(self, obs, reward, t):
        # `or` short-circuits, so no random number is drawn while t < 5.
        play_last = t < 5 or np.random.random() < 0.1
        column = 6 if play_last else np.random.randint(6)
        return PolicyStep(action=column, info=PolicyInfo())
