import os

import numpy as np
import pandas as pd

# Load the three input tables; each is read with pandas' default settings.
X, Y, T = (pd.read_csv(path) for path in ("X.csv", "Y.csv", "T.csv"))

# Put the tables side by side (aligned on the row index) and keep only the
# rows that have no missing value in any column.
df = pd.concat([X, Y, T], axis=1)
df_clean = df.dropna()

# Flag rows in which *every* column coming from T.csv is above 2000 and
# discard them; rows with at least one T value <= 2000 are kept.
# NOTE(review): presumably the T columns are birth weights in grams - confirm.
condition = df_clean[T.columns].gt(2000).all(axis=1)
df_clean = df_clean.loc[~condition]

# Split the cleaned frame back into its three original column groups.
X_clean, Y_clean, T_clean = (df_clean[table.columns] for table in (X, Y, T))
print(f"The number of left twins: {X_clean.shape[0]}")




# No sub-sampling happens here: every cleaned row is used, so N_individual
# is simply the number of cleaned rows.
N_individual = len(X_clean)

# Covariates without the two infant identifier columns.
X_sampled = X_clean.drop(columns=['infant_id_1', 'infant_id_0'])
indices = X_sampled.index

# Outcome and T rows for the same index labels as the covariates.
Y_sampled = Y_clean.loc[indices]
T_sampled = T_clean.loc[indices]

# The 'gestat10' covariate is kept separately; it receives its own weight.
GESTAT10 = X_sampled['gestat10']

# Random weights: one per covariate other than 'gestat10', drawn from a
# normal with mean 0 and standard deviation 0.1, followed by a single scalar
# weight drawn from a normal with mean 5 and standard deviation 0.1.
# The draw order (w_o, then w_h) matters for the random stream.
w_o = np.random.normal(loc=0, scale=0.1, size=len(X_sampled.columns) - 1)
w_h = np.random.normal(loc=5, scale=0.1)

def sigmoid(x):
    """Return the logistic function ``1 / (1 + exp(-x))`` of *x*.

    The value is evaluated as ``exp(-log(1 + exp(-x)))`` using
    ``np.logaddexp`` so that ``exp(-x)`` is never formed explicitly.  The
    naive expression overflows for large negative ``x`` and emits a
    floating-point overflow warning (or raises under a strict
    ``np.errstate``); this form does not.

    Only NumPy ufuncs and unary negation are applied to *x*, so scalars and
    NumPy arrays are handled element-wise.
    """
    # log(1 + exp(-x)) == logaddexp(0, -x), computed without overflow.
    return np.exp(-np.logaddexp(0, -x))


# Covariates that receive the per-column weights w_o ('gestat10' removed).
X_without_gestat = X_sampled.drop(columns=['gestat10'])

# Rescaled 'gestat10' term: value / 10, shifted down by 0.1.
z_term = GESTAT10.div(10).sub(0.1)

# Per-row score: weighted sum of the remaining covariates plus the weighted
# 'gestat10' term, squashed into (0, 1) to give a treatment probability.
linear_combination = X_without_gestat @ w_o + z_term * w_h
probability_treatment = sigmoid(linear_combination)

# One Bernoulli draw per row using that row's probability.
treatment = np.random.binomial(n=1, p=probability_treatment)
print('The sampled treatment is ' + str(treatment))

# Re-attach the row index so the draws line up with the other tables.
treatment = pd.Series(treatment, index=indices, name="treatment")

# Potential outcomes, selected by column name: 'mort_0' is exported as y_0
# and 'mort_1' as y_1.
y_0 = pd.Series(
    Y_sampled['mort_0'],
    index=indices,
    name="y_0"
)

y_1 = pd.Series(
    Y_sampled['mort_1'],
    index=indices,
    name="y_1"
)

# Factual / counterfactual outcomes are picked by column *name* as well, so
# they always agree with y_0 / y_1 regardless of the column order (or any
# extra columns) in Y.csv.  A row with treatment == 1 observes 'mort_1' and
# its counterfactual is 'mort_0'; a row with treatment == 0 is the reverse.
is_treated = treatment.to_numpy() == 1

y_observed = pd.Series(
    np.where(is_treated, y_1.to_numpy(), y_0.to_numpy()),
    index=indices,
    name="y_factual"
)

y_counterfactual = pd.Series(
    np.where(is_treated, y_0.to_numpy(), y_1.to_numpy()),
    index=indices,
    name="y_counterfactual"
)

# One row per retained record: treatment, the four outcome columns, then the
# covariates.
final_df = pd.concat([treatment, y_observed, y_counterfactual, y_0, y_1, X_sampled], axis=1)

# When every cleaned row is used the file goes to the working directory;
# otherwise it goes into a sub-directory named after the sample size.
if N_individual == X_clean.shape[0]:
    final_df.to_csv("all_biased.csv", index=False)
else:
    # to_csv does not create missing directories, so make sure it exists.
    os.makedirs(str(N_individual), exist_ok=True)
    final_df.to_csv(f"{N_individual}/{N_individual}_biased.csv", index=False)

