"""
Usage: Set datasets to use and N_SHUFFLES below, then invoke 'python shuffle.py' from the base cb_bakeoff directory.
"""

import argparse
import sys
import os
import subprocess

# Directory holding the input files; each dataset is read from here as
# "<name>.vw.gz".
dataset_dir = 'multiclass/'
# Directory receiving the shuffled copies, named "<name>_shuf<idx>.vw.gz".
output_dir = 'multiclass_shuffled/'

# Base names (without the ".vw.gz" extension) of the datasets to shuffle.
datasets = [
    'ds_1041_10',
    'ds_1110_23',
    'ds_1113_23',
    'ds_150_7',
    'ds_153_2',
    'ds_157_5',
    'ds_158_5',
    'ds_161_2',
    'ds_162_2',
    'ds_293_2',
    'ds_389_17',
    'ds_396_6',
    'ds_399_10',
    'ds_458_4',
    'ds_554_10',
    'ds_822_2',
    'ds_971_2',
]

# Number of independently shuffled copies produced per dataset.
N_SHUFFLES = 10

def _shuffle_gz(src_path, dst_path):
    """Write a line-shuffled, gzip-compressed copy of src_path to dst_path.

    Runs the equivalent of ``cat src | gunzip | shuf | gzip > dst`` without
    going through a shell, so paths need no quoting and the exit status of
    every stage can be checked. Output goes to a temporary file next to
    dst_path and is renamed into place only when all stages succeed, so an
    interrupted or failed run never leaves a truncated file at dst_path.

    Args:
        src_path: Path of the gzip-compressed input file.
        dst_path: Path the shuffled, gzip-compressed output is written to.

    Returns:
        True if every pipeline stage exited with status 0 and dst_path was
        written, False otherwise.

    Raises:
        OSError / IOError: If a file cannot be opened or one of the external
            tools (gunzip, shuf, gzip) cannot be launched.
    """
    tmp_path = dst_path + '.tmp'
    succeeded = False
    try:
        with open(src_path, 'rb') as src, open(tmp_path, 'wb') as dst:
            gunzip_proc = subprocess.Popen(
                ['gunzip'], stdin=src, stdout=subprocess.PIPE)
            shuf_proc = subprocess.Popen(
                ['shuf'], stdin=gunzip_proc.stdout, stdout=subprocess.PIPE)
            # Drop the parent's copy of each pipe's read end so an upstream
            # stage receives SIGPIPE if its consumer exits early.
            gunzip_proc.stdout.close()
            gzip_proc = subprocess.Popen(
                ['gzip'], stdin=shuf_proc.stdout, stdout=dst)
            shuf_proc.stdout.close()
            exit_codes = [proc.wait()
                          for proc in (gzip_proc, shuf_proc, gunzip_proc)]
        succeeded = not any(exit_codes)
        if succeeded:
            # Same-directory rename: the final path only ever appears with
            # complete contents.
            os.rename(tmp_path, dst_path)
    finally:
        if not succeeded and os.path.exists(tmp_path):
            os.remove(tmp_path)
    return succeeded


if __name__ == '__main__':

    # The output files cannot be created if the directory is missing.
    if not os.path.isdir(output_dir):
        os.makedirs(output_dir)

    for ds in datasets:

        ds_path = os.path.join(dataset_dir, ds + ".vw.gz")
        if not os.path.exists(ds_path):
            print(ds_path, " does not exist.")
            continue

        for idx in range(N_SHUFFLES):

            shuf_name = ds + '_shuf{}'.format(idx)
            shuf_path = os.path.join(output_dir, shuf_name + ".vw.gz")

            # Existing shuffles are kept, so the script can be re-run to
            # fill in only the missing files.
            if os.path.exists(shuf_path):
                continue

            print("Writing to {}".format(shuf_path))

            if not _shuffle_gz(ds_path, shuf_path):
                # Keep going with the remaining files; the failed one is
                # retried on the next run because nothing was left behind.
                sys.stderr.write("Failed to write {}\n".format(shuf_path))
