import numpy as np
import torch
import copy
import os
import pandas as pd
import rpy2.robjects as ro
from rpy2.robjects import pandas2ri
from rpy2.robjects.packages import importr


class CoxCP:
    """Contextual pricing policy backed by a semi-parametric `icenReg` fit.

    Prices are first drawn from ``generator.random()`` for ``l0`` rounds.
    After that the policy works in episodes whose length doubles each time:
    at the start of every episode it refits ``icenReg::ic_sp(model='ph')``
    on the interval-censored data gathered during the previous episode only,
    and prices every round of the new episode with :meth:`sample`.

    Attributes:
        device: ``"cuda"`` if torch reports a CUDA device, else ``"cpu"``
            (not read by any method of this class).
        l0: number of initial random-pricing rounds.
        d: context dimension.
        T: horizon (rounds per repetition).
        ngrid: number of candidate prices on the ``[0, 1]`` grid.
        generator: object exposing ``random()``; presumably a
            ``numpy.random.Generator`` -- confirm with the caller.
        ph_fit: latest R fit object returned by ``ic_sp`` (``None`` until fit).
        theta: ``coef`` component of ``ph_fit`` (``None`` until fit).
        reward, optimal_reward: per-round traces of the current repetition.
    """

    def __init__(self, l0, d, T, ngrid, generator):
        self.device = "cuda" if torch.cuda.is_available() else "cpu"

        self.l0         = l0
        self.d          = d
        self.T          = T
        self.ph_fit     = None
        self.theta      = None
        self.ngrid      = ngrid
        self.generator  = generator
        self.reward         = np.zeros((T,))
        self.optimal_reward = np.zeros((T,))

    def _covariate_names(self):
        """Return the column names ``X1 .. Xd`` used for the R data frames."""
        return [f"X{j}" for j in range(1, self.d + 1)]

    @staticmethod
    def _censoring_bounds(prices, outcomes):
        """Turn (price, outcome) pairs into interval-censoring bounds.

        A truthy outcome yields the interval ``[price, inf)``; a falsy one
        yields ``[0, price]``.

        Returns:
            Tuple ``(low, up)`` of arrays with the same length as ``prices``.
        """
        hit = np.asarray(outcomes).astype(bool)
        prices = np.asarray(prices)
        low = np.where(hit, prices, 0)
        up = np.where(~hit, prices, np.inf)
        return low, up

    def _fit(self, icenreg, X, P, Y, colnames):
        """Refit the model on one episode of data; update ``ph_fit``/``theta``."""
        low, up = self._censoring_bounds(P, Y)
        data = pd.concat(
            [pd.DataFrame({'l': low, 'u': up}), pd.DataFrame(X, columns=colnames)],
            axis=1,
        )
        self.ph_fit = icenreg.ic_sp(ro.Formula('cbind(l, u) ~ .'), data=data, model='ph')
        self.theta = ro.baseenv['$'](self.ph_fit, 'coef')

    def sample(self, x):
        """Pick the price on the grid that maximises the estimated reward.

        Requires ``self.ph_fit`` and ``self.theta`` to be set (``run`` does
        this before calling). A pandas DataFrame is handed to R here, so this
        presumably relies on the conversion activated in ``run`` -- confirm
        before calling it standalone.

        Args:
            x: context exposing ``.numpy(force=True)`` with ``d`` elements
                (presumably a torch tensor).

        Returns:
            The grid value in ``[0, 1]`` with the largest estimated reward, or
            ``self.generator.random()`` if any estimate is NaN.
        """
        x = x.numpy(force=True)
        context = pd.DataFrame(x.reshape(1, self.d), columns=self._covariate_names())

        # Look `seq` up directly in R's base environment instead of building
        # the whole `base` package wrapper with importr() on every round.
        grid_r = ro.baseenv['seq'](0, 1, length=self.ngrid)
        grid = np.array(grid_r)  # convert once, reuse below

        scale = np.exp(np.dot(x, self.theta))
        cdf = np.array(ro.r['getFitEsts'](self.ph_fit, newdata=context, q=grid_r))
        # NOTE(review): getFitEsts already receives `newdata`, so raising the
        # survival estimate to exp(x . theta) may apply the covariate effect
        # twice -- confirm against the intended estimator. Formula kept as is.
        expected_rewards = grid * (1 - cdf) ** scale

        if np.any(np.isnan(expected_rewards)):
            return self.generator.random()
        return grid[np.argmax(expected_rewards)]

    def run(self, rep, env, basedir):
        """Run ``rep`` independent repetitions and save the reward traces.

        Args:
            rep: number of repetitions.
            env: environment exposing ``reset()``, ``gen_context()``,
                ``act(x, price) -> (realization, probability)`` and
                ``optimal_action(x) -> (_, optimal_reward)``.
            basedir: output directory; created if missing.

        Side effects:
            Activates the rpy2 pandas conversion, loads the R package
            ``icenReg``, prints ``run {r}`` per repetition, and writes
            ``reward.npy`` and ``optimal_reward.npy`` (each of shape
            ``(rep, T)``) into ``basedir``.
        """
        pandas2ri.activate()
        icenreg = importr('icenReg')
        colnames = self._covariate_names()

        rewards = np.zeros((rep, self.T))
        optimal_rewards = np.zeros((rep, self.T))

        for r in range(rep):
            print(f'run {r}')
            episode_len = self.l0
            env.reset()
            X = np.zeros((self.l0, self.d))
            P = np.zeros(episode_len)
            Y = np.zeros(episode_len)

            # First episode: price uniformly at random. Capped at T so that
            # l0 > T no longer indexes past the end of the reward traces.
            n_explore = min(self.l0, self.T)
            for t in range(n_explore):
                x = env.gen_context()
                price = self.generator.random()
                realization, probability = env.act(x, price)
                X[t] = x.numpy(force=True)
                P[t] = price
                Y[t] = realization
                self.reward[t] = price * probability
                _, self.optimal_reward[t] = env.optimal_action(x)
            t = n_explore

            while t < self.T:
                # Refit on the data of the episode that just finished.
                self._fit(icenreg, X, P, Y, colnames)

                # Next episode is twice as long and starts with fresh buffers.
                episode_len = int(episode_len * 2)
                X = np.zeros((episode_len, self.d))
                P = np.zeros(episode_len)
                Y = np.zeros(episode_len)
                for i in range(episode_len):
                    if t >= self.T:
                        break
                    x = env.gen_context()
                    price = self.sample(x)
                    realization, probability = env.act(x, price)
                    X[i] = x.numpy(force=True)
                    P[i] = price
                    Y[i] = realization
                    self.reward[t] = price * probability
                    _, self.optimal_reward[t] = env.optimal_action(x)
                    t += 1

            # Row assignment copies the values, so no explicit copy is needed.
            rewards[r] = self.reward
            optimal_rewards[r] = self.optimal_reward

        # save results
        os.makedirs(basedir, exist_ok=True)
        np.save(os.path.join(basedir, 'reward.npy'), rewards)
        np.save(os.path.join(basedir, 'optimal_reward.npy'), optimal_rewards)