import os 
from tqdm import tqdm 
import random 
import shutil

# Seed the global RNG so the shuffled selection of training files is
# reproducible from run to run.
random.seed(2024)

# Target fraction of simulated files in the combined output directory.
prop_simulated = 0.1

SIMULATED_PREPROCESSED_DIR = '/scratch/simulated_preprocessed_data_all'
TRAIN_PREPROCESSED_DIR = '/scratch/preprocess/train_march_new'
OUTPUT_PREPROCESSED_DIR = f'/scratch/finetuning_preprocessed_{prop_simulated}_all'

# Number of entries the simulated directory is required to contain.
EXPECTED_NUM_SIM_FILES = 6000


def compute_num_real_files(num_sim_files, prop):
    """Return how many real files are needed alongside ``num_sim_files``.

    The count is chosen so that simulated files make up a fraction ``prop``
    of the combined set, i.e. ``floor(num_sim_files * (1 - prop) / prop)``.

    Args:
        num_sim_files: Number of simulated files that will be copied.
        prop: Desired fraction of simulated files, in the range (0, 1].

    Returns:
        The number of real files to select, as an int.

    Raises:
        ValueError: If ``prop`` is not in (0, 1].
    """
    if not 0 < prop <= 1:
        raise ValueError(f"prop must be in (0, 1], got {prop!r}")
    # Binary floating point can land a hair below the exact integer result
    # (e.g. 9 * (1 - 0.9) / 0.9 == 0.9999999999999998). A tiny tolerance
    # keeps the floor semantics without losing a file to rounding error.
    return int(num_sim_files * (1 - prop) / prop + 1e-9)


def _copy_files(file_names, src_dir, dst_dir):
    """Copy each name in ``file_names`` from ``src_dir`` into ``dst_dir``."""
    for file_name in tqdm(file_names):
        shutil.copyfile(
            os.path.join(src_dir, file_name),
            os.path.join(dst_dir, file_name),
        )


def main():
    """Build the output directory from all simulated and a sample of real files.

    Every entry of ``SIMULATED_PREPROCESSED_DIR`` is copied, together with a
    seeded random subset of ``TRAIN_PREPROCESSED_DIR`` sized according to
    ``prop_simulated``.

    Raises:
        RuntimeError: If the simulated directory does not hold the expected
            number of files, if a simulated and a selected real file share a
            name, or if the output directory does not end up with exactly the
            copied files.
    """
    os.makedirs(OUTPUT_PREPROCESSED_DIR, exist_ok=True)

    sim_files = os.listdir(SIMULATED_PREPROCESSED_DIR)
    # Explicit raise instead of ``assert`` so the check survives ``python -O``.
    if len(sim_files) != EXPECTED_NUM_SIM_FILES:
        raise RuntimeError(
            f"Expected {EXPECTED_NUM_SIM_FILES} simulated files in "
            f"{SIMULATED_PREPROCESSED_DIR}, found {len(sim_files)}."
        )

    # os.listdir returns entries in arbitrary order; sort before shuffling so
    # the seeded shuffle picks the same subset on every run.
    train_files = sorted(os.listdir(TRAIN_PREPROCESSED_DIR))
    random.shuffle(train_files)

    num_real_files = compute_num_real_files(len(sim_files), prop_simulated)
    if num_real_files > len(train_files):
        # Slicing would silently truncate; keep going with what is available
        # but make the shortfall visible.
        print(
            f"Warning: requested {num_real_files} real files but only "
            f"{len(train_files)} are available; the simulated proportion "
            f"will be higher than {prop_simulated}."
        )
    real_files = train_files[:num_real_files]

    # A shared name would make the second copy overwrite the first; detect it
    # before copying anything rather than after the whole run.
    clashing = set(sim_files).intersection(real_files)
    if clashing:
        raise RuntimeError(
            f"{len(clashing)} file name(s) occur in both source directories, "
            f"e.g. {sorted(clashing)[0]!r}."
        )

    print(f"For prop_simulated={prop_simulated}, we have {len(sim_files)} simulated scenarios and {len(real_files)} real scenarios.")

    _copy_files(sim_files, SIMULATED_PREPROCESSED_DIR, OUTPUT_PREPROCESSED_DIR)
    _copy_files(real_files, TRAIN_PREPROCESSED_DIR, OUTPUT_PREPROCESSED_DIR)

    # The output directory must contain exactly what was just copied; extra
    # entries (e.g. left over from an earlier run) are reported as an error.
    expected_total = len(real_files) + len(sim_files)
    actual_total = len(os.listdir(OUTPUT_PREPROCESSED_DIR))
    if actual_total != expected_total:
        raise RuntimeError(
            f"{OUTPUT_PREPROCESSED_DIR} contains {actual_total} entries, "
            f"expected {expected_total}."
        )
    print("Done!")


if __name__ == "__main__":
    main()
