import warnings
warnings.filterwarnings("ignore")
warnings.filterwarnings("ignore", category=FutureWarning)
from main import *
from joblib import Parallel, delayed

# ----- Column roles -----
weight_col = "ord__WKHP"   # column recoded below and used as the per-row weight
target = 'PINCP'           # column recoded below and used as the +1 / -1 label

# ----- Load the data set and recode two columns in place -----
df_full = pd.read_csv("acc_auc/final_data/NJ_data_with_noise.csv")

# Weight column -> three levels: exactly 40 becomes 2, below 40 becomes 1,
# and every other value becomes 5.
df_full[weight_col] = np.where(
    df_full[weight_col] == 40, 2,
    np.where(df_full[weight_col] < 40, 1, 5)
)

# Target column -> +1 where it equals 1, -1 everywhere else.
df_full[target] = np.where(df_full[target] != 1, -1, 1)

# Features are every column except the weight, the target and 'original_weight'.
updated_features = [
    col for col in df_full.columns
    if col not in (weight_col, 'original_weight', target)
]

# ----- Experiment grid -----
number_samples = 2000        # passed to generate_data as `num` (train draws per class)
test_size_per_class = 2500   # test draws per class
kernels = ['poly2']          # other names kept for reference: 'poly3', 'poly4', 'rbf1', 'rbf2', 'rbf3'
cs = [2**e for e in range(-8, 8, 2)]  # alternative grid kept for reference: np.logspace(-3, 3, 10)

# ----- Dense views of the full data set -----
all_x = df_full.loc[:, updated_features].to_numpy()
all_y = df_full[target].to_numpy()
all_w = df_full[weight_col].to_numpy()

exp_v = all_w.mean()                  # mean of the recoded weights
pos_idx = np.flatnonzero(all_y == 1)  # row positions labelled +1
neg_idx = np.flatnonzero(all_y == -1) # row positions labelled -1

# ----- Settings forwarded to run_svm_payment -----
k = np.max(np.linalg.norm(all_x, axis=1))  # largest row norm of the feature matrix
k_coef = 1.0          # coefficient to multiply k with
sigma_loss = 1.0
fit_intercept = True  # whether to fit intercept in SVM model
show_plots = False    # set to True to see plots of the binary search

# ----- Trial bookkeeping -----
T = 1               # number of trials
base_seed = 12345   # per-trial seeds are derived from this value plus the trial index

print("start running...")

def compute_for_t(t):
    """Run one trial: draw a train/test split and fit every (kernel, c) pair.

    The trial index ``t`` is added to the module-level ``base_seed`` to seed
    the random generator, so the same ``t`` always draws the same split.
    Every combination of the module-level ``kernels`` and ``cs`` is dispatched
    as its own joblib task, and each task writes one CSV file under
    ``acc_auc/final_data/kernels/``.

    Args:
        t: Trial index. Used for seeding, stored in the ``t`` column of the
            results and embedded in the output file name.

    Returns:
        None. The per-task results are persisted as CSV files.
    """
    import os  # local import so the block does not depend on the star-import

    seed = (base_seed + t) % (2**32)
    rng = np.random.default_rng(seed)
    X, y, v, X_test, y_test, v_test = generate_data(
        all_x, all_y, all_w, pos_idx, neg_idx,
        number_samples, test_size_per_class, rng
    )

    # Create the output directory before dispatching any work. to_csv does not
    # create missing parent directories, so without this every task would only
    # fail at the very end, after its fit has already been computed.
    out_dir = 'acc_auc/final_data/kernels'
    os.makedirs(out_dir, exist_ok=True)

    def run_kernel_c(kernel, c):
        """Fit one (kernel, c) configuration, score it on the test split, save it."""
        # run_svm_payment comes from `main`; it is used here as returning a
        # results table (with an 'agent' column) and a fitted model exposing
        # `predict` -- NOTE(review): confirm against its definition.
        df_results, model = run_svm_payment(
            X, y, v, c, kernel,
            sigma_loss=sigma_loss,
            plot=show_plots,
            is_throw=True,
            k=k,
            k_coef=k_coef,
            fit_intercept=fit_intercept,
            payment_mode="exact"
        )
        # Stamp the run configuration onto every row.
        df_results['t'] = t
        df_results['c'] = c
        df_results['kernel'] = kernel
        # 'agent' is used as a positional index into the training labels.
        df_results['label'] = y[df_results['agent'].astype(int)]

        # 1 where the prediction matches the test label, 0 otherwise.
        test_allocations = (model.predict(X_test) == y_test).astype(int)
        df_results['test_acc'] = test_allocations.mean()
        # Share of the total test weight that sits on correctly predicted rows.
        df_results["test_welfare"] = np.sum(test_allocations * v_test) / np.sum(v_test)

        df_results.to_csv(
            f'{out_dir}/results_real_data_t_{t}_kernel_{kernel}_c={c}.csv',
            index=False
        )
        return df_results

    # Parallel over kernel x c
    Parallel(n_jobs=-1, backend='loky')(
        delayed(run_kernel_c)(kernel, c)
        for kernel in kernels
        for c in cs
    )


def generate_data(all_x, all_y, all_w,
                  pos_idx, neg_idx,
                  num,
                  test_size_per_class,
                  rng):
    """Draw a class-balanced training split and a disjoint, balanced test split.

    ``num`` indices are drawn without replacement from ``pos_idx`` and from
    ``neg_idx`` for training. The test split then draws
    ``test_size_per_class`` indices per class, again without replacement,
    from the indices of that class that were not used for training.

    Args:
        all_x, all_y, all_w: Arrays indexed by the drawn row positions
            (features, labels and weights respectively).
        pos_idx, neg_idx: Index arrays to draw from for each class.
        num: Number of training draws per class.
        test_size_per_class: Number of test draws per class.
        rng: Generator providing ``choice`` (e.g. ``np.random.default_rng``).

    Returns:
        Tuple ``(X, y, sample_weight, X_test, y_test, sample_weight_test)``;
        within each split the positive-class draws come first.
    """
    def _left_over(pool, taken):
        # Entries of `pool` that were not drawn in `taken`, in pool order.
        keep = np.ones(len(all_y), dtype=bool)
        keep[taken] = False
        return pool[keep[pool]]

    # The draw order (train positives, train negatives, test positives, test
    # negatives) fixes how the generator state is consumed.
    train_pos = rng.choice(pos_idx, size=num, replace=False)
    train_neg = rng.choice(neg_idx, size=num, replace=False)
    train_rows = np.concatenate([train_pos, train_neg])

    X, y, sample_weight = (
        arr[train_rows] for arr in (all_x, all_y, all_w)
    )

    # Candidates for the test split: everything not used for training.
    spare_pos = _left_over(pos_idx, train_pos)
    spare_neg = _left_over(neg_idx, train_neg)

    test_pos = rng.choice(spare_pos, size=test_size_per_class, replace=False)
    test_neg = rng.choice(spare_neg, size=test_size_per_class, replace=False)
    test_rows = np.concatenate([test_pos, test_neg])

    X_test, y_test, sample_weight_test = (
        arr[test_rows] for arr in (all_x, all_y, all_w)
    )

    return X, y, sample_weight, X_test, y_test, sample_weight_test


# Alternative driver (disabled): fan out over several trial indices with
# joblib instead of running a single trial. Note that compute_for_t already
# starts its own Parallel(n_jobs=-1) pool over the kernel x c grid.
# Parallel(n_jobs=-1)(
#     delayed(compute_for_t)(t)
#     for t in range(1,5)
# )

# Run a single trial with t=1; its RNG seed is derived from base_seed + t.
compute_for_t(1)

