import math
import pathlib
from typing import Optional

import numpy as np
import pandas as pd
from xgboost import XGBRegressor


def load_uci_data(
    n_features: Optional[int] = None,
):
    """Load and preprocess the UCI slice-localization regression dataset.

    The CSV is read from ``<three directories above this file>/data/
    slice_localization_data.csv``. Every column except the last is treated as
    an input feature and the last column as the regression target.

    Preprocessing steps, in order:

    1. Constant feature columns are dropped and the remaining features are
       min-max scaled to ``[-1, 1]``.
    2. Targets are standardized (zero mean, unit standard deviation) using
       statistics of the full dataset.
    3. At most 10,000 rows are kept, chosen by a permutation seeded with 0.
    4. If ``n_features`` is given, an ``XGBRegressor`` is fitted on the kept
       rows and only the ``n_features`` highest-importance columns are
       retained (ordered by decreasing importance).
    5. The first half of the kept rows becomes the training set and the
       remainder the test set.

    Args:
        n_features: Number of most-important features to keep. ``None``
            (default) keeps every non-constant feature.

    Returns:
        A tuple ``(train_x, train_y, test_x, test_y)`` of NumPy arrays.

    Raises:
        ValueError: If the CSV file cannot be located or read. The underlying
            error is attached as ``__cause__``.
    """
    # taken from the BODi paper (https://arxiv.org/pdf/2303.01774.pdf)
    try:
        path = (
            pathlib.Path(__file__).parent.parent.parent.resolve()
            / "data"
            / "slice_localization_data.csv"
        )
        df = pd.read_csv(path, sep=",")
    except Exception as err:
        # `Exception` (not a bare `except`) so that KeyboardInterrupt and
        # SystemExit propagate; chain the cause so the real failure is visible.
        raise ValueError(
            "Failed to load `slice_localization_data.csv`. The slice dataset can be downloaded "
            "from: https://archive.ics.uci.edu/ml/datasets/Relative+location+of+CT+slices+on+axial+axis"
        ) from err
    data = df.to_numpy()

    # Get the input data
    X = data[:, :-1]
    # Out-of-place shift: `data` may be a read-only view of the DataFrame's
    # buffer (e.g. under pandas Copy-on-Write), so it must not be written to.
    X = X - X.min(axis=0)
    X = X[:, X.max(axis=0) > 1e-6]  # Throw away constant dimensions
    X = X / (X.max(axis=0) - X.min(axis=0))
    X = 2 * X - 1
    # Internal sanity check on the scaling above (not input validation).
    assert X.min() == -1 and X.max() == 1

    # Standardize targets
    y = data[:, -1]
    y = (y - y.mean()) / y.std()

    # Only keep 10,000 data points and n_features features
    shuffled_indices = np.random.RandomState(0).permutation(X.shape[0])[:10_000]  # Use seed 0
    X, y = X[shuffled_indices], y[shuffled_indices]

    if n_features is not None:
        # Use Xgboost to figure out feature importances and keep only the most important features.
        # NOTE: the importance model is fitted before the train/test split, so it sees all kept rows.
        xgb = XGBRegressor(max_depth=8).fit(X, y)
        inds = (-xgb.feature_importances_).argsort()  # descending importance
        X = X[:, inds[:n_features]]

    # Train/Test split on a subset of the data
    train_n = int(math.floor(0.50 * X.shape[0]))
    train_x, train_y = X[:train_n], y[:train_n]
    test_x, test_y = X[train_n:], y[train_n:]

    return train_x, train_y, test_x, test_y
