import numpy as np
import os
import urllib

# Folder where downloaded datasets are cached. The COPT_DATA_DIR environment
# variable takes precedence; otherwise "copt_data" in the user's home is used.
DATA_DIR = os.getenv(
    "COPT_DATA_DIR",
    os.path.join(os.path.expanduser("~"), "copt_data"),
)

def load_realsim():
    """Download (if needed) and return the real-sim dataset.

    This is the binary classification version of the dataset as found in the
    LIBSVM dataset project:

        https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary.html#real-sim

    The compressed file is cached as ``real-sim.bz2`` inside ``DATA_DIR``
    (created if missing) and is only downloaded when that file does not
    exist yet.

    Returns:
      X : scipy.sparse CSR matrix
      y: numpy array
          Labels, only takes values 0 or 1.
    """
    # A bare `import urllib` does not load the `request` submodule, so import
    # it explicitly here to guarantee `urllib.request` is available.
    import urllib.request

    from sklearn import datasets  # lazy import

    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(DATA_DIR, exist_ok=True)
    file_path = os.path.join(DATA_DIR, "real-sim.bz2")
    if not os.path.exists(file_path):
        print("real-sim dataset is not present in data folder. Downloading it ...")
        url = "https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/real-sim.bz2"
        # Download under a temporary name and rename only on success, so an
        # interrupted transfer never leaves a truncated file that later calls
        # would mistake for a complete download.
        partial_path = file_path + ".part"
        try:
            urllib.request.urlretrieve(url, partial_path)
            os.replace(partial_path, file_path)
        finally:
            # Only still present if the download or the rename failed.
            if os.path.exists(partial_path):
                os.remove(partial_path)
        print("Finished downloading")

    X, y = datasets.load_svmlight_file(file_path)
    # Map labels from {-1, +1} to {0, 1}. Use the builtin `int`: the `np.int`
    # alias was removed in NumPy 1.24 and was always just an alias for it.
    y = ((y + 1) // 2).astype(int)
    return X, y