import numpy as np
import gymnasium as gym
from abc import abstractmethod, ABCMeta
from misc.rng_modules import PRNGSequence
from rl.utils.replay_buffer import ReplayBuffer
from stable_baselines3.common.logger import Logger, HumanOutputFormat
import sys


class OffPolicy(metaclass=ABCMeta):
    """Abstract base class holding the shared state of an off-policy agent.

    The constructor stores the environment and hyper-parameters, allocates a
    replay buffer sized from the environment's observation/action spaces,
    creates the random number generators, resets the environment once to
    obtain the first observation, and sets up a stdout logger.

    Subclasses must implement :meth:`train_step`.
    """

    def __init__(self,
                 env: gym.Env,
                 gamma: float = 0.99,
                 learning_rate: float = 0.99,
                 buffer_capacity: int = 1_000_000,
                 batch_size: int = 256,
                 seed: int = 42,
                 ):
        """Initialise the agent state.

        Args:
            env: Environment to interact with. Its ``observation_space`` and
                ``action_space`` shapes are used to size the replay buffer.
            gamma: Stored as ``self.gamma`` (presumably the discount factor).
            learning_rate: Stored as ``self.learning_rate``.
                NOTE(review): the default of 0.99 equals the ``gamma`` default
                and is unusually large for a learning rate -- confirm it is
                intended. It is kept as-is to preserve the interface.
            buffer_capacity: Capacity passed to the replay buffer.
            batch_size: Stored as ``self.batch_size``.
            seed: Seed used for both RNGs and for the initial environment
                reset.
        """
        self.env = env
        self.gamma = gamma
        self.learning_rate = learning_rate
        self.buffer_capacity = buffer_capacity
        self.batch_size = batch_size
        self.buffer = ReplayBuffer(
            observation_shape=self.observation_space.shape,
            action_shape=self.action_space.shape,
            capacity=buffer_capacity)
        self.hk_rng = PRNGSequence(seed)
        self.np_rng = np.random.default_rng(seed)
        # Seed the environment with the same seed as the RNGs above; a bare
        # reset() would leave the environment's randomness unseeded and make
        # runs non-reproducible even with a fixed ``seed``.
        # reset() returns (observation, info); only the observation is kept.
        self.last_observation: np.ndarray = self.env.reset(seed=seed)[0]
        self.step_cnt = 0
        # No output folder; human-readable records are written to stdout only.
        self.logger = Logger(None, [HumanOutputFormat(sys.stdout)])

    @property
    def action_space(self):
        """Action space of the wrapped environment."""
        return self.env.action_space

    @property
    def observation_space(self):
        """Observation space of the wrapped environment."""
        return self.env.observation_space

    @abstractmethod
    def train_step(self, *args, **kwargs):
        """Run one training step; must be implemented by subclasses."""
        pass

    def make_placeholder(self):
        """Return a sampled ``(observation, action)`` pair with a leading axis.

        Each value is drawn from the corresponding space and indexed with
        ``[None]``, which prepends an axis of length 1. The values themselves
        are random samples; presumably only their shape/dtype matter to
        callers (e.g. for initialising networks) -- verify against usage.
        """
        obs_ph = self.observation_space.sample()[None]
        action_ph = self.action_space.sample()[None]
        return obs_ph, action_ph




