
#############################################################################
## This script loads the outputs generated by the jupyter notebook
## "generate_synthetic_data_for_real_data_experiments_openml_cc18.ipynb"
## and evaluates the synthetic data w.r.t. fidelity and privacy metrics.
#############################################################################

# source utility functions
source("utility_functions_for_miav_tabpfn_iclr.R")

library(arrow)

# Path to the folder storing the saved outputs. Leave it as "" to read the
# files relative to the current working directory.
data_path <- ""

# The file names below are appended to data_path with paste0(), so a non-empty
# folder path has to end with a path separator. Add one when it is missing, so
# that e.g. "my_folder" and "my_folder/" both resolve to the same files.
if (nzchar(data_path) && !grepl("[/\\\\]$", data_path)) {
  data_path <- paste0(data_path, "/")
}

# load a large feather dataset containing the original and holdout data splits
# for the first 21 datasets
df_split <- read_feather(
  paste0(data_path, "openml_cc18_orig_hold_data_splits.feather")
)

# load a large feather dataset containing the MIAV-based synthetic versions
# of each data split of the original data
df_miav <- read_feather(
  paste0(data_path, "openml_cc18_syn_miav.feather")
)

# load a large feather dataset containing the JF-based synthetic versions
# of each data split of the original data
df_jf <- read_feather(
  paste0(data_path, "openml_cc18_syn_jf.feather")
)

# load a large feather dataset containing the FC-based synthetic versions
# of each data split of the original data
df_fc <- read_feather(
  paste0(data_path, "openml_cc18_syn_fc.feather")
)


## Evaluate the MIAV synthetic datasets w.r.t. the fidelity and privacy
## metrics. The RNG seed is reset right before the call, and the result is
## written to an .RData file (relative to the current working directory).
set.seed(12345)
out_miav <- EvaluateSyntheticData(
  df_split = df_split,
  df_synth = df_miav,
  n_runs = 5
)
save(
  out_miav,
  file = "real_data_evaluations_first_21_datasets_miav.RData",
  compress = TRUE
)


## Evaluate the JF synthetic datasets w.r.t. the fidelity and privacy
## metrics. The RNG seed is reset right before the call, and the result is
## written to an .RData file (relative to the current working directory).
set.seed(12345)
out_jf <- EvaluateSyntheticData(
  df_split = df_split,
  df_synth = df_jf,
  n_runs = 5
)
save(
  out_jf,
  file = "real_data_evaluations_first_21_datasets_jf.RData",
  compress = TRUE
)


## Evaluate the FC synthetic datasets w.r.t. the fidelity and privacy
## metrics. The RNG seed is reset right before the call, and the result is
## written to an .RData file (relative to the current working directory).
set.seed(12345)
out_fc <- EvaluateSyntheticData(
  df_split = df_split,
  df_synth = df_fc,
  n_runs = 5
)
save(
  out_fc,
  file = "real_data_evaluations_first_21_datasets_fc.RData",
  compress = TRUE
)

## Evaluate the holdout datasets w.r.t. the fidelity and privacy metrics.
## Only the original/holdout splits are passed in here; no synthetic data is
## involved. The RNG seed is reset right before the call, and the result is
## written to an .RData file (relative to the current working directory).
set.seed(12345)
out_hold <- EvaluateHoldoutData(
  df_split = df_split,
  n_runs = 5
)
save(
  out_hold,
  file = "real_data_evaluations_first_21_datasets_hold.RData",
  compress = TRUE
)


## Evaluate the SMOTE datasets w.r.t. the fidelity and privacy metrics.
## No SMOTE data is loaded from disk: the SMOTE datasets are generated
## internally by EvaluateSmoteData() from the data splits. The RNG seed is
## reset right before the call, and the result is written to an .RData file
## (relative to the current working directory).
set.seed(12345)
out_smote <- EvaluateSmoteData(
  df_split = df_split,
  n_runs = 5,
  k = 5
)
save(
  out_smote,
  file = "real_data_evaluations_first_21_datasets_smote.RData",
  compress = TRUE
)


# Load the six feather datasets containing the noisy-MIAV-based synthetic
# versions of each data split of the original data. The files differ only in
# the numeric suffix of their name (0.05, 0.1, ..., 0.3) -- presumably the
# noise level; confirm against the notebook that generated them.
df_nmiav1 <- read_feather(
  paste0(data_path, "openml_cc18_syn_noisy_miav_0.05.feather")
)
df_nmiav2 <- read_feather(
  paste0(data_path, "openml_cc18_syn_noisy_miav_0.1.feather")
)
df_nmiav3 <- read_feather(
  paste0(data_path, "openml_cc18_syn_noisy_miav_0.15.feather")
)
df_nmiav4 <- read_feather(
  paste0(data_path, "openml_cc18_syn_noisy_miav_0.2.feather")
)
df_nmiav5 <- read_feather(
  paste0(data_path, "openml_cc18_syn_noisy_miav_0.25.feather")
)
df_nmiav6 <- read_feather(
  paste0(data_path, "openml_cc18_syn_noisy_miav_0.3.feather")
)

## Evaluate the noisy-MIAV datasets w.r.t. the fidelity and privacy metrics.
##
## The six noisy-MIAV datasets go through exactly the same steps, so they are
## handled in one loop instead of six copy-pasted stanzas. For dataset number i
## the result is still stored in the variable out_nmiav<i> and saved under that
## name, so load() on the resulting .RData files restores the same objects
## (out_nmiav1, ..., out_nmiav6) as before.

# file-name suffixes, kept as strings so they match the file names exactly
nmiav_levels <- c("0.05", "0.1", "0.15", "0.2", "0.25", "0.3")

# synthetic datasets, in the same order as nmiav_levels
nmiav_synth <- list(df_nmiav1, df_nmiav2, df_nmiav3,
                    df_nmiav4, df_nmiav5, df_nmiav6)

for (nmiav_idx in seq_along(nmiav_levels)) {
  nmiav_out_name <- paste0("out_nmiav", nmiav_idx)

  # reset the seed right before every evaluation, as for the other methods
  set.seed(12345)
  assign(nmiav_out_name,
         EvaluateSyntheticData(df_split = df_split,
                               df_synth = nmiav_synth[[nmiav_idx]],
                               n_runs = 5))

  # save(list = ...) stores the object under its own name (out_nmiav<i>)
  save(list = nmiav_out_name,
       file = paste0("real_data_evaluations_first_21_datasets_noisy_miav_",
                     nmiav_levels[nmiav_idx], ".RData"),
       compress = TRUE)
}

# drop the loop helpers so that only out_nmiav1, ..., out_nmiav6 are left behind
rm(nmiav_levels, nmiav_synth, nmiav_idx, nmiav_out_name)


