import pandas as pd

# Build a single shuffled dataset by drawing a fixed-size sample from each
# per-dataset parquet file, concatenating the samples, shuffling the result,
# and writing it back out as one parquet file.

# Names of the datasets to combine; each maps to one parquet file on disk.
datasets = ['attrition', 'breast_cancer_recurrence', 'heart_disease', 'income', 'pima_diabetes', 'bank_marketing', 'moral_machines']
# Number of rows drawn from each dataset. Sampling is without replacement, so
# pandas raises ValueError if a dataset has fewer rows than this.
sample = 1000
# Seed used for BOTH the per-dataset sampling and the final shuffle so the
# output file is reproducible from run to run.
seed = 42

loaded_datasets = {}
for dataset in datasets:
    loaded_datasets[dataset] = pd.read_parquet(
        f"data/natural_counterfactuals/{dataset}_counterfactual_dataset_balanced.parquet"
    )
print("Loaded datasets")

# random_state=seed makes the per-dataset draw deterministic; without it the
# selected rows differ on every run even though the final shuffle is seeded.
combined = pd.concat(
    [
        frame.sample(sample, random_state=seed).reset_index(drop=True)
        for frame in loaded_datasets.values()
    ]
)
print("Sampled and concatenated datasets")

# Shuffle all rows (sampling len(combined) rows without replacement is a
# permutation) and rebuild a clean 0..n-1 index before saving.
combined = combined.sample(len(combined), random_state=seed).reset_index(drop=True)
combined.to_parquet('data/natural_counterfactuals/combined_dataset.parquet')
print(len(combined))
print("Save combined dataset")
