"""Data downloaded from
https://archive.ics.uci.edu/dataset/199/miniboone+particle+identification
on 23rd February 2024. The data is from the MiniBooNE experiment at Fermilab.
Labels are 1 for signal, 0 for background. The positive signal is a muon
neutrino oscillating into an electron neutrino.
The data is a binary classification problem, with 130,064 examples and 50 features.
First 36,499 are positive examples, the remaining 93,565 are negative.

Folder should already exist so no need to mkdir.
We use 40000 negative examples to enforce class balance, so the dataset size
is 76499. We use val and test size = 10,000 so train size is 56499.
"""


import os
import os.path as osp

import numpy as np
import pandas as pd
import torch

from datasets.preprocessing_utils import preprocess_and_save_data


# Location of the raw download; also passed on to the preprocessing helper.
path = osp.join("datasets", "data", "miniboone")

# Validation and test splits have the same fixed size; whatever remains of the
# balanced dataset becomes the training split.
val_size = test_size = 10000

# Seed NumPy's global RNG so the random subsample of negative examples drawn
# by this script is reproducible.
SEED = 4903
np.random.seed(SEED)


if __name__ == "__main__":
  # Script entry point: read the raw MiniBooNE text file, build a roughly
  # class-balanced dataset, keep a fixed subset of feature columns and hand the
  # result to preprocess_and_save_data.
  raw_file = osp.join(path, "raw.txt")

  # Load in miniboone raw csv. The first line holds the two class counts and
  # every later line is one example, so the file is parsed in two passes.
  print("Loading raw data.")
  try:
    # Raw string for the regex separator: "\s" inside a plain string literal is
    # an invalid escape sequence and triggers a warning on recent Pythons.
    num_pos_neg = pd.read_csv(raw_file, sep=r"\s+", nrows=1, header=None).to_numpy()
    X = pd.read_csv(raw_file, sep=r"\s+", skiprows=1, header=None).to_numpy()
  except FileNotFoundError as err:
    # Keep the original error chained so the missing path stays visible.
    raise FileNotFoundError("Please download the miniboone data from https://archive.ics.uci.edu/dataset/199/miniboone+particle+identification") from err
  print("Data loaded.")

  # Find how many positive and how many negative. Plain ints keep the slicing
  # and rounding below independent of NumPy scalar semantics.
  num_positive = int(num_pos_neg[0, 0])
  num_negative = int(num_pos_neg[0, 1])

  # The labels below are derived purely from row position, so a header that
  # disagrees with the number of rows would silently mislabel examples.
  if X.shape[0] != num_positive + num_negative:
    raise ValueError(
      f"Header of {raw_file} declares {num_positive} + {num_negative} examples "
      f"but {X.shape[0]} rows were read."
    )

  # Separate positive and negative examples, shuffle negative and select roughly
  # same number as positive.
  X_pos = X[:num_positive]
  X_neg = X[num_positive:]
  shuffle_ids = np.random.permutation(X_neg.shape[0])
  X_neg = X_neg[shuffle_ids]
  X_neg = X_neg[:round(num_positive, -4)]  # Round to the nearest 10000 (40000).

  # Construct the X and y data and convert to tensors. Positives come first in
  # the concatenation, so the first num_positive labels are set to 1.
  X = np.concatenate([X_pos, X_neg], axis=0)
  y = np.zeros(shape=(X.shape[0]))
  y[:num_positive] = 1.0

  X = torch.tensor(X).float()
  y = torch.tensor(y).long()

  # Extract the best features based on XGBoost feature importances.
  best_features = np.array([14, 34, 21, 3, 42, 23, 6, 29, 39, 44,
                            22, 20, 25, 43, 17, 15, 41, 26, 40, 2])
  X = X[:, best_features]

  # Metadata passed alongside the data; all selected features are continuous
  # and the task has two output classes.
  dataset_dict = {
    "num_con_features": X.shape[1],
    "num_cat_features": 0,
    "most_categories": 0,
    "out_dim": 2,
    "metric": "auroc",
    "max_dim": None,
  }

  # Training split is whatever is left after reserving val and test examples.
  preprocess_and_save_data(
    path=path,
    dataset_dict=dataset_dict,
    train_size=X.shape[0] - (val_size + test_size),
    val_size=val_size,
    X=X,
    y=y,
    M=None,
    shuffle=True,
    num_bins=200,
    size_normal=1e-5,
    ratio_uniform=0.05,
  )




