import gym
import numpy as np


class HardMDP(gym.Env):
    """Deterministic three-state MDP with two absorbing states.

    States are the integers ``0``, ``1`` and ``2``; actions are ``0`` and
    ``1``. Every episode starts in state ``0``. From state ``0`` the first
    action decides the rest of the episode: action ``0`` moves to state
    ``1`` and action ``1`` moves to state ``2``; that transition itself
    yields a reward of ``0``. States ``1`` and ``2`` are absorbing: every
    further step taken from state ``1`` yields ``+1`` and every further step
    taken from state ``2`` yields ``-1``, whatever the action.

    The episode only terminates through the step limit: ``done`` becomes
    ``True`` once ``max_episode_steps`` steps have elapsed since ``reset``.
    """

    def __init__(self, max_episode_steps: int):
        """Create the environment.

        Args:
            max_episode_steps: Number of ``step`` calls after which ``step``
                starts reporting ``done=True``.
        """
        super().__init__()
        # NOTE: the attribute spelling ("episodes") is kept unchanged so that
        # any code reading this attribute keeps working.
        self._max_episodes_steps = max_episode_steps
        self.observation_space = gym.spaces.Discrete(n=3)
        self.action_space = gym.spaces.Discrete(n=2)
        # Both stay None until reset() is called for the first time.
        self._elapsed_steps = None
        self.state = None
        self.absorbing_states = {1, 2}

    def reset(self) -> int:
        """Start a new episode and return the initial state (always ``0``)."""
        self.state = 0
        self._elapsed_steps = 0
        return self.state

    def seed(self, seed=None) -> None:
        """Seed NumPy's global random number generator.

        The transitions and rewards of this environment do not draw random
        numbers, so this only affects other users of ``np.random``.
        """
        np.random.seed(seed)

    def step(self, action) -> tuple:
        """Apply ``action`` and advance the environment by one step.

        Args:
            action: An element of ``action_space`` (``0`` or ``1``).

        Returns:
            A ``(next_state, reward, done, info)`` tuple, where ``info`` is
            always an empty dict.

        Raises:
            AssertionError: If ``action`` is not contained in
                ``action_space``.
            RuntimeError: If called before ``reset``.
        """
        assert self.action_space.contains(action), (
            f"{action!r} ({type(action)}) is not a valid action"
        )
        if self._elapsed_steps is None:
            # Without this guard the step counter increment below fails with
            # an uninformative "NoneType += int" TypeError.
            raise RuntimeError("Cannot call step() before calling reset()")

        if self.state in self.absorbing_states:
            # Absorbing states ignore the action and pay out on every step.
            next_state = self.state
            reward = 1 if self.state == 1 else -1
        else:
            # The only decision point: the action picks the absorbing state.
            reward = 0
            next_state = 1 if action == 0 else 2

        self._elapsed_steps += 1
        # The step limit is the only way an episode ends.
        done = self._elapsed_steps >= self._max_episodes_steps
        self.state = next_state

        return next_state, reward, done, {}
