#!/bin/bash
#
# Preprocess the EMBER dataset and convert the resulting CSV files with csv2bin.
#
# Steps, in order:
#   1. Install the `ember` Python package from its GitHub repository.
#   2. Run experiment/preprocess/ember_preprocess.py.
#   3. Run scripts/setup.sh.
#   4. Run experiment/build/csv2bin on the all_pos, train, val and test CSVs.
#
# Every path below is relative, so the script must be started from the
# directory that contains data/, experiment/ and scripts/.
#
# The whole script is a no-op (exit 0) when the marker file
# data/ember/preprocessed/X_test.bin already exists.

set -euo pipefail

readonly data_dir="data/ember/preprocessed"
readonly csv2bin="./experiment/build/csv2bin"

# Marker used to decide whether preprocessing has already been done.
# NOTE(review): presumably X_test.bin is written by the last csv2bin call
# below, which would make it a "everything finished" marker -- confirm.
readonly done_marker="${data_dir}/X_test.bin"

if [[ -f "$done_marker" ]]; then
    echo "Preprocessed data already exists. Skipping..."
    exit 0
fi

echo "Preprocessing EMBER dataset..."
# Invoke pip through python3 so the package is installed for the same
# interpreter that runs the preprocessing script on the next line; a bare
# `pip` on PATH may belong to a different Python installation.
python3 -m pip install git+https://github.com/elastic/ember.git
python3 experiment/preprocess/ember_preprocess.py

# NOTE(review): setup.sh is assumed to provide experiment/build/csv2bin --
# confirm against scripts/setup.sh.
./scripts/setup.sh

# The all_pos set has keys and features only (no -y label file is passed).
"$csv2bin" -k "${data_dir}/all_pos_key.csv" -x "${data_dir}/all_pos_X.csv"

# train / val / test share the same file layout: keys, features, labels.
for split in train val test; do
    "$csv2bin" \
        -k "${data_dir}/X_${split}_key.csv" \
        -x "${data_dir}/X_${split}.csv" \
        -y "${data_dir}/y_${split}.csv"
done
