import math
from typing import Optional, Tuple, Union

import numpy as np
import os
import pandas as pd
from collections import defaultdict

import gymnasium as gym
from gymnasium import logger, spaces
from gymnasium.envs.classic_control import utils
from gymnasium.error import DependencyNotInstalled
from gymnasium.experimental.vector import VectorEnv
from gymnasium.vector.utils import batch_space


class MeerkatEnv(gym.Env[np.ndarray, Union[int, np.ndarray]]):
    """
    ## Description

    A small tabular environment with 25 discrete states and 25 discrete actions.
    The chosen action directly becomes the next state: `step` stores the action
    as the new state. Demonstration trajectories for this state/action encoding
    can be loaded from CSV files with `rollout`.

    ## Action Space

    `Discrete(25)`: the index of the state to move to.

    ## Observation Space

    `Discrete(25)`. `reset` and `step` return the current state index wrapped in
    a 0-d `float32` array.

    ## Rewards

    `1.0` for every step that does not end the episode and `0.0` for the step
    that terminates it.

    ## Starting State

    A state index drawn uniformly at random from `{0, ..., 24}`.

    ## Episode End

    Termination: the 29th step of an episode. `step` never reports truncation.

    ## Arguments

    `render_mode` is stored on the instance. The `options` argument of `reset`
    is accepted for API compatibility and is not used.
    """

    metadata = {
        "render_modes": ["human", "rgb_array"],
        "render_fps": 50,
    }

    # Size of both the state set and the action set.
    _NUM_STATES = 25
    # Number of transitions after which `step` terminates the episode.
    _EPISODE_STEPS = 29
    # CSV rows per demonstration: one more state than there are transitions.
    _ROWS_PER_TRAJECTORY = 30

    def __init__(self, render_mode: Optional[str] = None):
        # The physics-style attributes below are not read by any method of this
        # class; they are kept so existing attribute access keeps working.
        self.gravity = 9.8
        self.masscart = 1.0
        self.masspole = 0.1
        self.total_mass = self.masspole + self.masscart
        self.length = 0.5  # actually half the pole's length
        self.polemass_length = self.masspole * self.length
        self.force_mag = 10.0
        self.tau = 0.02  # seconds between state updates
        self.kinematics_integrator = "euler"

        self.action_space = spaces.Discrete(self._NUM_STATES)
        self.observation_space = spaces.Discrete(self._NUM_STATES)

        self.render_mode = render_mode

        self.screen_width = 600
        self.screen_height = 400
        self.screen = None
        self.clock = None
        self.isopen = True
        self.state = None
        self.timestemp = 0  # steps taken in the current episode
        self.steps_beyond_terminated = None

    def step(self, action):
        """Move to the state selected by `action`.

        Args:
            action: A member of `self.action_space`.

        Returns:
            A tuple `(observation, reward, terminated, truncated, info)` where
            `observation` is the new state as a 0-d float32 array, `reward` is
            1.0 (0.0 on the terminating step), `truncated` is always False and
            `info` is an empty dict.

        Raises:
            AssertionError: If the action is not in the action space or if
                `reset` has not been called yet.
        """
        assert self.action_space.contains(action), f"{action!r} ({type(action)}) invalid"
        assert self.state is not None, "Call reset before using step method."

        self.state = action
        self.timestemp += 1

        terminated = bool(self.timestemp >= self._EPISODE_STEPS)
        if terminated:
            # Rearm the counter so the next episode gets its full length even
            # if the caller steps again without calling reset().
            self.timestemp = 0
            reward = 0.0
        else:
            reward = 1.0

        # NOTE(review): the observation is a float32 array although the declared
        # space is Discrete; confirm that downstream consumers accept this.
        return np.array(self.state, dtype=np.float32), reward, terminated, False, {}

    def reset(
        self,
        *,
        seed: Optional[int] = None,
        options: Optional[dict] = None,
    ):
        """Start a new episode from a uniformly random state.

        Args:
            seed: Forwarded to `gym.Env.reset` to seed `self.np_random`.
            options: Accepted for API compatibility; not used.

        Returns:
            A tuple `(observation, info)` with the initial state as a 0-d
            float32 array (the same form `step` returns) and an empty dict.
        """
        super().reset(seed=seed)
        # A single state index, so reset() and step() agree on what a state is.
        self.state = int(self.np_random.integers(self._NUM_STATES))
        # An episode abandoned before termination must not shorten the next one.
        self.timestemp = 0

        if self.render_mode == "human":
            # NOTE(review): this class defines no render(); the call only works
            # if a subclass or the base class provides one.
            self.render()
        return np.array(self.state, dtype=np.float32), {}

    def rollout(self, folder_path):
        """Load demonstration trajectories from the CSV files in a folder.

        Every `*.csv` file directly inside `folder_path` is read without a
        header row and only its third column is used. The column is cut into
        consecutive chunks of 30 rows; a trailing chunk with fewer rows is
        dropped. Each distinct value is assigned an integer id in order of
        first appearance (shared across all files). Within a chunk, state `i`
        is the observation and state `i + 1` the action, giving 29 transitions.

        Files are visited in sorted name order so that the id assigned to each
        value does not depend on the directory listing order.

        Args:
            folder_path: Directory containing the CSV files.

        Returns:
            A list of `TrajectoryWithRew` objects, one per complete chunk.
        """
        rows = self._ROWS_PER_TRAJECTORY
        state_action_mapping = defaultdict(lambda: len(state_action_mapping))
        all_trajectories = []

        for file_name in sorted(os.listdir(folder_path)):
            if not file_name.endswith('.csv'):
                continue
            file_path = os.path.join(folder_path, file_name)
            data = pd.read_csv(file_path, header=None, usecols=[2])  # third column only
            values = data.iloc[:, 0].values

            # The range stops early enough that every chunk has exactly `rows` rows.
            for start in range(0, len(values) - rows + 1, rows):
                # NOTE(review): ids are not bounded by the size of the spaces;
                # more than 25 distinct values would yield ids outside them.
                ids = [state_action_mapping[value] for value in values[start:start + rows]]

                # NOTE(review): every transition gets reward 1 here, whereas
                # step() returns 0 on the terminating step.
                all_trajectories.append(
                    TrajectoryWithRew(
                        obs=ids[:-1],
                        acts=ids[1:],
                        rews=np.ones(len(ids) - 1, dtype=np.float32),
                        terminal=True,
                    )
                )

        return all_trajectories


class MeerkatVectorEnv(VectorEnv):
    """Batched variant of the 25-state environment in which the action becomes the next state.

    `num_envs` independent sub-environments are advanced together. For each
    sub-environment the chosen action (an integer in `{0, ..., 24}`) becomes its
    new state. A sub-environment terminates on its 29th step and is truncated
    once it reaches `max_episode_steps`; a finished sub-environment is
    immediately restarted from a uniformly random state, and that fresh state is
    what `step` returns for it.

    `single_action_space` / `single_observation_space` describe one
    sub-environment (`Discrete(25)`); `action_space` / `observation_space` are
    their batched forms produced by `batch_space`.
    """

    metadata = {
        "render_modes": ["human", "rgb_array"],
        "render_fps": 50,
    }

    # Size of both the state set and the action set.
    _NUM_STATES = 25
    # Number of transitions after which a sub-environment terminates.
    _EPISODE_STEPS = 29
    # CSV rows per demonstration: one more state than there are transitions.
    _ROWS_PER_TRAJECTORY = 30

    def __init__(
        self,
        num_envs: int = 2,
        max_episode_steps: int = 500,
        render_mode: Optional[str] = None,
    ):
        super().__init__()
        self.num_envs = num_envs

        # The physics-style attributes below are not read by any method of this
        # class; they are kept so existing attribute access keeps working.
        self.gravity = 9.8
        self.masscart = 1.0
        self.masspole = 0.1
        self.total_mass = self.masspole + self.masscart
        self.length = 0.5  # actually half the pole's length
        self.polemass_length = self.masspole * self.length
        self.force_mag = 10.0
        self.tau = 0.02  # seconds between state updates
        self.kinematics_integrator = "euler"
        self.max_episode_steps = max_episode_steps

        # Batched spaces: step() receives one action per sub-environment, which
        # an un-batched Discrete space would always reject.
        self.single_action_space = spaces.Discrete(self._NUM_STATES)
        self.action_space = batch_space(self.single_action_space, num_envs)
        self.single_observation_space = spaces.Discrete(self._NUM_STATES)
        self.observation_space = batch_space(self.single_observation_space, num_envs)

        # Steps taken in the current episode of each sub-environment.
        self.steps = np.zeros(num_envs, dtype=np.int32)
        self.render_mode = render_mode

        self.screen_width = 600
        self.screen_height = 400
        self.screen = None
        self.clock = None
        self.isopen = True
        self.state = None
        self.timestemp = 0  # kept for compatibility; episode length is tracked in self.steps
        self.steps_beyond_terminated = None

    def step(self, action: np.ndarray) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray, dict]:
        """Advance every sub-environment by one step.

        Args:
            action: One action per sub-environment; must be a member of
                `self.action_space`.

        Returns:
            A tuple `(observations, rewards, terminated, truncated, info)`.
            `observations` is an int64 array with one state index per
            sub-environment (the freshly sampled start state for those that
            just finished), `rewards` is a float32 array of ones, `terminated`
            and `truncated` are boolean arrays and `info` is an empty dict.

        Raises:
            AssertionError: If the action is not in the action space or if
                `reset` has not been called yet.
        """
        assert self.action_space.contains(
            action
        ), f"{action!r} ({type(action)}) invalid"
        assert self.state is not None, "Call reset before using step method."

        # np.array copies, so the in-place restart below never writes into the
        # caller's action buffer.
        self.state = np.array(action, dtype=np.int64)

        self.steps += 1
        terminated: np.ndarray = self.steps >= self._EPISODE_STEPS
        truncated = self.steps >= self.max_episode_steps
        reward = np.ones_like(terminated, dtype=np.float32)

        done = terminated | truncated

        if done.any():
            # Restart finished sub-environments in place.
            self.state[done] = self.np_random.integers(self._NUM_STATES, size=int(done.sum()))
            self.steps[done] = 0

        return self.state, reward, terminated, truncated, {}

    def reset(
            self,
            *,
            seed: Optional[int] = None,
            options: Optional[dict] = None,
    ):
        """Restart all sub-environments from uniformly random states.

        Args:
            seed: Forwarded to `VectorEnv.reset` to seed `self.np_random`.
            options: Accepted for API compatibility; not used.

        Returns:
            A tuple `(observations, info)` with one int64 state index per
            sub-environment and an empty dict.
        """
        super().reset(seed=seed)
        self.state = self.np_random.integers(self._NUM_STATES, size=self.num_envs)
        # Start every episode with a clean step counter.
        self.steps = np.zeros(self.num_envs, dtype=np.int32)
        self.steps_beyond_terminated = None

        return self.state, {}

    def rollout(self, folder_path=r'D:\CC_AIRL_test\imitations\env\output3'):
        """Load demonstration trajectories from the CSV files in a folder.

        Every `*.csv` file directly inside `folder_path` is read without a
        header row and only its third column is used. The column is cut into
        consecutive chunks of 30 rows; a trailing chunk with fewer rows is
        dropped. Each distinct value is assigned an integer id in order of
        first appearance (shared across all files). Within a chunk, state `i`
        is the observation and state `i + 1` the action, giving 29 transitions.

        Files are visited in sorted name order so that the id assigned to each
        value does not depend on the directory listing order.

        Args:
            folder_path: Directory containing the CSV files. Defaults to the
                location that was previously hard-coded in this method.

        Returns:
            A list of `TrajectoryWithRew` objects, one per complete chunk.
        """
        rows = self._ROWS_PER_TRAJECTORY
        state_action_mapping = defaultdict(lambda: len(state_action_mapping))
        all_trajectories = []

        for file_name in sorted(os.listdir(folder_path)):
            if not file_name.endswith('.csv'):
                continue
            file_path = os.path.join(folder_path, file_name)
            data = pd.read_csv(file_path, header=None, usecols=[2])  # third column only
            values = data.iloc[:, 0].values

            # The range stops early enough that every chunk has exactly `rows` rows.
            for start in range(0, len(values) - rows + 1, rows):
                # NOTE(review): ids are not bounded by the size of the spaces;
                # more than 25 distinct values would yield ids outside them.
                ids = [state_action_mapping[value] for value in values[start:start + rows]]

                all_trajectories.append(
                    TrajectoryWithRew(
                        obs=ids[:-1],
                        acts=ids[1:],
                        rews=np.ones(len(ids) - 1, dtype=np.float32),  # reward of 1 per transition
                        terminal=True,
                    )
                )

        return all_trajectories


class TrajectoryWithRew:
    """Container for one trajectory: observations, actions, rewards and a terminal flag.

    The constructor copies its inputs into NumPy arrays: `obs` and `acts` as
    int64, `rews` as float32. `terminal` is stored as given and `infos` starts
    out as `None`.
    """

    def __init__(self, obs, acts, rews, terminal):
        self.obs = np.array(obs, dtype=np.int64)
        self.acts = np.array(acts, dtype=np.int64)
        self.rews = np.array(rews, dtype=np.float32)
        self.terminal = terminal
        # Placeholder; can be filled in later if per-step info is needed.
        self.infos = None

    def __repr__(self):
        def render(values):
            # Shared formatting for all three arrays.
            return np.array2string(values, separator=', ', precision=2, suppress_small=True)

        parts = (
            f"obs=array({render(self.obs)}, dtype=int64)",
            f"acts=array({render(self.acts)}, dtype=int64)",
            f"infos={self.infos}",
            f"terminal={self.terminal}",
            f"rews=array({render(self.rews)})",
        )
        return "TrajectoryWithRew(" + ", ".join(parts) + ")"











