import numpy as np
import pandas as pd
import os
from scipy.stats import bernoulli
import pandas as pd
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, MinMaxScaler

# Seed that data_generate passes to np.random.seed and to the RandomState it
# assigns to scipy's bernoulli distribution.
RANDOM_SEED = 42
# Number of replications data_generate simulates; one CSV file is written per
# replication.
n_replication = 30 

def get_acic_covariates(data_dir):
    """Load the covariate table and return a raw and a standardized copy.

    Reads ``x.csv`` from ``data_dir``, drops the columns ``x_2``, ``x_21`` and
    ``x_24``, one-hot encodes (dropping the first level) every remaining
    column whose first value is not an ``np.int64``/``np.float64`` scalar,
    keeps the columns at the positions listed in ``NUMERIC_COLS`` and
    standardizes them with ``StandardScaler``.

    Args:
        data_dir: Directory that contains ``x.csv``.

    Returns:
        tuple: ``(raw, scaled)`` where ``raw`` is the selected columns as a
        NumPy array and ``scaled`` is the same data after
        ``StandardScaler.fit_transform``.
    """
    X = pd.read_csv(os.path.join(data_dir, 'x.csv'))
    # Positional indices (after the drop below) of the columns to keep.
    # NOTE(review): index 24 is listed twice, so that column is selected twice
    # by ``iloc`` below. This looks like a typo, but removing it changes the
    # number of covariates, so it is kept as-is -- confirm before changing.
    NUMERIC_COLS = [0, 3, 4, 16, 17, 18, 20, 21, 22, 23, 24, 24, 25, 30, 31, 32, 33, 39, 40, 41, 53, 54]
    X = X.drop(columns=['x_2', 'x_21', 'x_24'])

    encoded_cols = []
    # Iterate over a snapshot because one-hot columns are appended to X below.
    for col in list(X.columns):
        if isinstance(X.loc[X.index[0], col], (np.int64, np.float64)):
            continue

        enc = OneHotEncoder(drop='first')
        # Encode once per column instead of once per output feature.
        encoded = enc.fit_transform(np.array(X[[col]]).reshape((-1, 1))).toarray()

        # ``get_feature_names`` was removed in scikit-learn 1.2; prefer its
        # replacement and fall back for older releases.
        names_fn = getattr(enc, 'get_feature_names_out', None)
        if names_fn is None:
            names_fn = enc.get_feature_names

        for k, feature_name in enumerate(names_fn()):
            X[col + feature_name] = encoded[:, k]

        encoded_cols.append(col)

    # Drop the original categorical columns once, after the loop. Dropping
    # inside the loop re-dropped already-removed columns on the following
    # iteration and raised KeyError.
    X.drop(encoded_cols, axis=1, inplace=True)

    X = X.iloc[:, NUMERIC_COLS]
    scaler = StandardScaler()
    X_t = scaler.fit_transform(X)
    return X.values, X_t

def generate_inner(x, nonlinearity_y):
    """Build polynomial features of ``x`` up to the requested order.

    The result starts with the columns of ``x`` and is followed, for each
    order ``2..nonlinearity_y``, by every monomial of that order, i.e. the
    product ``x[:, i1] * ... * x[:, ik]`` for all ``i1 <= ... <= ik`` in
    lexicographic index order.

    Args:
        x: 2-D array whose columns are the base features.
        nonlinearity_y: Highest polynomial order to include; one of 1, 2, 3
            or 4.

    Returns:
        ``x`` itself when ``nonlinearity_y`` is 1, otherwise a new 2-D array
        with the base features followed by the monomial columns.

    Raises:
        ValueError: If ``nonlinearity_y`` is not 1, 2, 3 or 4.
    """
    from itertools import combinations_with_replacement

    if nonlinearity_y not in (1, 2, 3, 4):
        raise ValueError(
            'nonlinearity_y must be 1, 2, 3 or 4, got {!r}'.format(nonlinearity_y)
        )
    max_order = int(nonlinearity_y)
    if max_order == 1:
        return x

    d = x.shape[1]
    blocks = [x]
    # Only build the orders that are actually requested; the number of
    # monomials grows quickly with the order.
    for order in range(2, max_order + 1):
        monomials = []
        for idx in combinations_with_replacement(range(d), order):
            # Multiply left to right over *all* indices of the combination
            # (the fourth-order terms previously omitted their last factor).
            term = x[:, idx[0]]
            for col in idx[1:]:
                term = term * x[:, col]
            monomials.append(term)
        if monomials:
            blocks.append(np.array(monomials).T)

    return np.concatenate(blocks, axis=1)

def sigmoid(x, beta_for_T, xi):
    """Return the logistic function of ``xi * np.dot(x, beta_for_T)``.

    Args:
        x: Array passed as the first operand of ``np.dot``.
        beta_for_T: Coefficients passed as the second operand of ``np.dot``.
        xi: Scalar multiplier applied to the dot product before the logistic
            transform.

    Returns:
        ``1 / (1 + exp(-xi * np.dot(x, beta_for_T)))``, evaluated elementwise.
    """
    linear_term = np.dot(x, beta_for_T)
    denominator = 1 + np.exp(-xi * linear_term)
    return 1 / denominator

def data_generate(path):
    """Simulate ``n_replication`` datasets and write each one to a CSV file.

    The output directory ``path`` is recreated from scratch (anything already
    under it is removed). Covariates come from
    ``get_acic_covariates('./causally/data/')`` (the standardized copy is
    used). For every replication the function draws a treatment assignment
    from a sigmoid propensity model, builds ``mu0`` from the features returned
    by ``generate_inner(X, 2)``, builds the treatment effect from all pairwise
    covariate products, adds Gaussian noise to obtain factual and
    counterfactual outcomes, and saves the result as
    ``<basename of path><replication index>.csv``.

    Args:
        path: Output directory. Its final path component is also used as the
            prefix of the generated CSV file names.
    """
    import shutil

    # Start from an empty output directory; makedirs also creates missing
    # parent directories.
    if os.path.exists(path):
        shutil.rmtree(path)
    os.makedirs(path)

    # File-name prefix. Derived from ``path`` rather than read from a global
    # that only exists when this file is executed as a script.
    data_name = os.path.basename(os.path.normpath(path))

    np.random.seed(RANDOM_SEED)
    bernoulli.random_state = np.random.RandomState(RANDOM_SEED)

    _, X = get_acic_covariates('./causally/data/')
    n_units, n_covariate = X.shape

    treatment_column = 'treatment'
    yf_column = 'yf'
    ycf_column = 'ycf'
    mu0_column = 'mu0'
    mu1_column = 'mu1'

    # The design matrices depend only on X and consume no random numbers, so
    # they are built once instead of once per replication.
    X_for_Y = generate_inner(X, 2)
    X_for_tau = np.array(
        [X[:, i] * X[:, j] for i in range(n_covariate) for j in range(i, n_covariate)]
    ).T
    rho = 0.1  # probability that a pairwise product enters the treatment effect
    x_columns = ['x{}'.format(col) for col in range(1, n_covariate + 1)]

    for rep in range(n_replication):
        print('Have generated {} replications for Simulation dataset!'.format(rep+1))

        # Treatment assignment from a sparse linear propensity model.
        beta_for_T = np.random.binomial(1, 0.1, n_covariate)
        prob_t = sigmoid(x=X, beta_for_T=beta_for_T, xi=1).squeeze()
        t = np.random.binomial(1, prob_t, n_units)

        # Control-arm mean outcome.
        beta_for_Y = np.random.binomial(1, 0.5, X_for_Y.shape[1]).reshape(-1, 1)
        print('cal mu0, X_for_Y shape = {}, beta_for_Y shape = {}'.format(X_for_Y.shape, beta_for_Y.shape))
        mu0 = np.matmul(X_for_Y, beta_for_Y) / 10
        print('sum mu0 = {}'.format(np.sum(mu0)))

        # Treatment effect from a sparse subset of the pairwise products.
        beta_for_tau = np.random.binomial(1, rho, X_for_tau.shape[1]).reshape(-1, 1)
        print('cal tau, X_for_tau shape = {}, beta_for_tau shape = {}'.format(X_for_tau.shape, beta_for_tau.shape))
        tau_x = np.matmul(X_for_tau, beta_for_tau) / 10

        mu1 = (mu0 + tau_x).squeeze()
        mu0 = mu0.squeeze()

        # Noisy potential outcomes; y0's noise is drawn before y1's.
        y0 = mu0 + np.random.normal(0, 0.1, n_units)
        y1 = mu1 + np.random.normal(0, 0.1, n_units)

        yf = np.where(t == 1, y1, y0)
        ycf = np.where(t == 0, y1, y0)

        data = {
            treatment_column: t,
            yf_column: yf,
            ycf_column: ycf,
            mu0_column: mu0,
            mu1_column: mu1,
        }
        for index, key in enumerate(x_columns):
            data[key] = X[:, index]

        df = pd.DataFrame(data)

        treated_pct = round(100 * (t.sum() / len(t)))
        ret = 'Treated ratios:{}, Control ratios:{}'.format(treated_pct, 100 - treated_pct)
        print(ret)
        df.to_csv(os.path.join(path, '{}{}.csv'.format(data_name, rep+1)), index=False)

    print('Totally generated {} replications!'.format(n_replication))

if __name__ == "__main__":
    # Locate the project root: the prefix of this file's directory that ends
    # with the 'iclr2026-main' folder.
    curPath = os.path.abspath(os.path.dirname(__file__))
    root_marker = 'iclr2026-main'
    marker_pos = curPath.find(root_marker)
    if marker_pos != -1:
        rootPath = curPath[:marker_pos + len(root_marker)]
    else:
        # str.find returns -1 when the marker is absent; slicing with that
        # value would silently yield a meaningless path prefix, so fall back
        # to the directory that contains this script.
        rootPath = curPath
    print('rootPath = ', rootPath)
    data_path = os.path.join(rootPath,'dataset')
    # Make sure the parent directory for the generated dataset exists.
    os.makedirs(data_path, exist_ok=True)
    data_name = 'ACIC-NewParam-StandardScalerX'
    path = os.path.join(data_path,data_name)
    data_generate(path)
