{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "from sklearn.datasets import load_svmlight_file\n",
    "from numpy import savetxt"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "# change this before start!\n",
    "# Both prefixes are joined to file names by plain string concatenation,\n",
    "# so each one must end with a trailing '/'.\n",
    "# datasets_path_prefix: directory the raw dataset files are read from.\n",
    "datasets_path_prefix = \"/Users/angelynaye/Desktop/research/data/adversarial-nonparametrics/nnattack/datasets/files/\"\n",
    "# path_prefix: directory the generated .csv files are written to.\n",
    "# NOTE(review): this path spells 'desktop' in lowercase while the one above\n",
    "# uses 'Desktop' -- confirm both resolve on a case-sensitive filesystem.\n",
    "path_prefix = \"/Users/angelynaye/desktop/research/data/\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Convert a svmlight/LIBSVM-format dataset into a space-delimited feature file.\n",
    "def preprocess(file_name, output_name, convert_float=False, save_labels=False):\n",
    "    \"\"\"Load a svmlight/LIBSVM file and save its features as `<output_name>.csv`.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    file_name : str\n",
    "        Name of the input file; it is appended to `datasets_path_prefix`.\n",
    "    output_name : str\n",
    "        Base name of the output file(s); they are written under `path_prefix`.\n",
    "    convert_float : bool, optional\n",
    "        If True, cast the feature matrix with `astype(float)` before saving.\n",
    "    save_labels : bool, optional\n",
    "        If True, additionally write `<output_name>_with_y.csv`, holding the\n",
    "        features with the integer label appended as the last column.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    None. The output file name and the feature matrix shape are printed.\n",
    "    \"\"\"\n",
    "    X, y = load_svmlight_file(datasets_path_prefix + file_name)\n",
    "    # toarray() gives a plain ndarray; todense() would return the legacy np.matrix type.\n",
    "    X = X.toarray()\n",
    "    if convert_float:\n",
    "        # np.float was removed in NumPy 1.24; the builtin float selects the same dtype.\n",
    "        X = X.astype(float)\n",
    "    # Only the label -1 is remapped (to 0); every other label value is kept unchanged.\n",
    "    y[y == -1] = 0\n",
    "    y = y.astype(int)\n",
    "\n",
    "    output_name1 = output_name + \".csv\"\n",
    "    print(output_name1 + \"--------------------------------------------\")\n",
    "    print(X.shape)\n",
    "    savetxt(path_prefix + output_name1, X, delimiter=' ')\n",
    "    if save_labels:\n",
    "        # Labels become one extra trailing column next to the features.\n",
    "        data_with_label = np.concatenate((X, y.reshape(-1, 1)), axis=1)\n",
    "        output_name2 = output_name + \"_with_y\" + \".csv\"\n",
    "        savetxt(path_prefix + output_name2, data_with_label, delimiter=' ')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "german.csv--------------------------------------------\n",
      "(1000, 24)\n",
      "splice.csv--------------------------------------------\n",
      "(1000, 60)\n",
      "svmguide3.csv--------------------------------------------\n",
      "(1243, 22)\n",
      "diabetes.csv--------------------------------------------\n",
      "(768, 8)\n",
      "fourclass.csv--------------------------------------------\n",
      "(862, 2)\n",
      "australian.csv--------------------------------------------\n",
      "(690, 14)\n",
      "cancer.csv--------------------------------------------\n",
      "(683, 10)\n",
      "ijcnn1.csv--------------------------------------------\n",
      "(35000, 22)\n",
      "covtype_bi.csv--------------------------------------------\n",
      "(581012, 54)\n"
     ]
    }
   ],
   "source": [
    "# Each entry is (input file name under datasets_path_prefix, base name of the .csv to write).\n",
    "files = [\n",
    "    (\"german.numer\", \"german\"),\n",
    "    (\"splice\", \"splice\"),\n",
    "    (\"svmguide3\", \"svmguide3\"),\n",
    "    (\"diabetes\", \"diabetes\"),\n",
    "    (\"fourclass\", \"fourclass\"),\n",
    "    (\"australian\", \"australian\"),\n",
    "    (\"breast-cancer\", \"cancer\"),\n",
    "    (\"ijcnn1.tr\", \"ijcnn1\"),\n",
    "    (\"covtype.libsvm.binary\", \"covtype_bi\"),\n",
    "]\n",
    "# Convert every listed dataset, one after the other, with the default options.\n",
    "for source_name, target_name in files:\n",
    "    preprocess(source_name, target_name)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(581012, 54)\n"
     ]
    }
   ],
   "source": [
    "# preprocess covtype.data (comma-separated numeric table)\n",
    "data_cov = np.genfromtxt(datasets_path_prefix + 'covtype.data', delimiter=',')\n",
    "# All columns but the last are the features; the last column, cast to int and\n",
    "# shifted down by one, is kept as the label (it is not written to disk here).\n",
    "tX_cov, ty_cov = data_cov[:, :-1].astype(float), data_cov[:, -1].astype(int) - 1\n",
    "print(tX_cov.shape)\n",
    "savetxt(path_prefix + 'cov_dat.csv', tX_cov, delimiter=' ')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(4177, 7)\n"
     ]
    }
   ],
   "source": [
    "# preprocess abalone\n",
    "# The file is read as strings because not every column is parsed as a number here.\n",
    "data_aba = np.genfromtxt(datasets_path_prefix + 'abalone.data', dtype='str', delimiter=',')\n",
    "# Keep columns 1 through 7 and parse them as floats; column 0 and any column\n",
    "# from index 8 onward are dropped.\n",
    "X_aba = data_aba[:, 1:8].astype(float)\n",
    "print(X_aba.shape)\n",
    "savetxt(path_prefix + 'abalone.csv', X_aba, delimiter=' ')\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(2300, 2)\n"
     ]
    }
   ],
   "source": [
    "# preprocess halfmoon dataset\n",
    "from sklearn.datasets import make_moons\n",
    "# 2000 + 100 + 200 samples, according to the research paper\n",
    "n_samples = 2300\n",
    "# Fixed random_state so the generated points are the same on every run.\n",
    "X_half_moon, y_half_moon = make_moons(\n",
    "    n_samples=n_samples,\n",
    "    noise=0.25,\n",
    "    random_state=110,\n",
    ")\n",
    "print(X_half_moon.shape)\n",
    "savetxt(path_prefix + 'halfmoon.csv', X_half_moon, delimiter=' ')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(12000, 25)\n"
     ]
    }
   ],
   "source": [
    "# preprocess f-mnist35: Fashion-MNIST training images labelled 3 or 5,\n",
    "# optionally reduced to n_dims PCA components\n",
    "from keras.datasets import fashion_mnist\n",
    "from sklearn.decomposition import PCA\n",
    "n_samples = 12000\n",
    "n_dims = 25\n",
    "\n",
    "(X, y), (_, _) = fashion_mnist.load_data()\n",
    "# Flatten every image into a single row.\n",
    "X = X.reshape(len(X), -1)\n",
    "# A seeded generator makes the sampled subset identical on every run;\n",
    "# the global np.random state is unseeded in this notebook.\n",
    "rng = np.random.RandomState(110)\n",
    "idx1 = rng.choice(np.where(y==3)[0], n_samples//2, replace=False)\n",
    "idx2 = rng.choice(np.where(y==5)[0], n_samples//2, replace=False)\n",
    "# Binary labels aligned with the stacked rows of X: label 3 -> 0, label 5 -> 1.\n",
    "y = np.concatenate((np.zeros(len(idx1), dtype=y.dtype),\n",
    "                    np.ones(len(idx2), dtype=y.dtype)))\n",
    "# np.float was removed in NumPy 1.24; the builtin float selects the same dtype.\n",
    "X = np.vstack((X[idx1], X[idx2])).astype(float) / 255.\n",
    "\n",
    "if n_dims:\n",
    "    pca = PCA(n_components=n_dims,\n",
    "            random_state=110)\n",
    "    X = pca.fit_transform(X)\n",
    "print(X.shape)\n",
    "savetxt(path_prefix + 'f-mnist35.csv', X , delimiter=' ')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "mnist17 train dataset shape: (13007, 25)\n",
      "mnist17 test dataset shape: (2163, 25)\n"
     ]
    }
   ],
   "source": [
    "# preprocess mnist17 dataset: MNIST images labelled 1 or 7,\n",
    "# optionally reduced to n_dims PCA components\n",
    "from keras.datasets import mnist\n",
    "# Imported here so this cell does not rely on another cell having run first.\n",
    "from sklearn.decomposition import PCA\n",
    "n_dims = 25\n",
    "\n",
    "(X, y), (tX, ty) = mnist.load_data()\n",
    "\n",
    "def process(X, y):\n",
    "    \"\"\"Keep only the samples labelled 1 or 7.\n",
    "\n",
    "    Returns a tuple (X, y): X holds the flattened images divided by 255,\n",
    "    with all label-1 rows first and all label-7 rows after them; y holds\n",
    "    the matching labels, remapped so that 1 -> 0 and 7 -> 1.\n",
    "    \"\"\"\n",
    "    # Divide by 255 exactly once, here; the rows stacked below are already scaled.\n",
    "    # np.float was removed in NumPy 1.24; the builtin float selects the same dtype.\n",
    "    X = X.reshape(len(X), -1).astype(float) / 255.\n",
    "    # Work on a copy so the caller's label array is not modified.\n",
    "    y = np.copy(y)\n",
    "    idx1 = np.where(y==1)[0]\n",
    "    idx2 = np.where(y==7)[0]\n",
    "    y[idx1] = 0\n",
    "    y[idx2] = 1\n",
    "    X = np.vstack((X[idx1], X[idx2]))\n",
    "    y = np.concatenate((y[idx1], y[idx2]))\n",
    "    return X, y\n",
    "X, y = process(X, y)\n",
    "tX, ty = process(tX, ty)\n",
    "\n",
    "if n_dims:\n",
    "    # PCA is fitted on the train and test rows together, then split back apart.\n",
    "    pca = PCA(n_components=n_dims,\n",
    "            random_state=110)\n",
    "    ttX = pca.fit_transform(np.vstack((X, tX)))\n",
    "    X, tX = ttX[:len(X)], ttX[len(X):]\n",
    "\n",
    "X_mnist17 = np.vstack((X,tX))\n",
    "print(\"mnist17 train dataset shape: \" + str(X.shape))\n",
    "print(\"mnist17 test dataset shape: \" + str(tX.shape))\n",
    "\n",
    "savetxt(path_prefix + 'mnist17_train.csv', X, delimiter=' ')\n",
    "savetxt(path_prefix + 'mnist17_test.csv', tX, delimiter=' ')\n",
    "# The combined file must hold the array built in this cell (train rows, then test rows).\n",
    "savetxt(path_prefix + 'mnist17.csv', X_mnist17, delimiter=' ')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "f-mnist06 train dataset shape: (12000, 25)\n",
      "f-mnist06 test dataset shape: (2000, 25)\n"
     ]
    }
   ],
   "source": [
    "# preprocess f-mnist06 data: Fashion-MNIST images labelled 0 or 6,\n",
    "# optionally reduced to n_dims PCA components\n",
    "from keras.datasets import fashion_mnist\n",
    "# Imported here so this cell does not rely on another cell having run first.\n",
    "from sklearn.decomposition import PCA\n",
    "n_dims = 25\n",
    "\n",
    "(X, y), (tX, ty) = fashion_mnist.load_data()\n",
    "\n",
    "def process(X, y):\n",
    "    \"\"\"Keep only the samples labelled 0 or 6.\n",
    "\n",
    "    Returns a tuple (X, y): X holds the flattened images divided by 255,\n",
    "    with all label-0 rows first and all label-6 rows after them; y holds\n",
    "    the matching labels, remapped so that 0 -> 0 and 6 -> 1.\n",
    "    \"\"\"\n",
    "    # Divide by 255 exactly once, here; the rows stacked below are already scaled.\n",
    "    # np.float was removed in NumPy 1.24; the builtin float selects the same dtype.\n",
    "    X = X.reshape(len(X), -1).astype(float) / 255.\n",
    "    # Work on a copy so the caller's label array is not modified.\n",
    "    y = np.copy(y)\n",
    "    idx1 = np.where(y==0)[0]\n",
    "    idx2 = np.where(y==6)[0]\n",
    "    y[idx1] = 0\n",
    "    y[idx2] = 1\n",
    "    X = np.vstack((X[idx1], X[idx2]))\n",
    "    y = np.concatenate((y[idx1], y[idx2]))\n",
    "    return X, y\n",
    "X, y = process(X, y)\n",
    "tX, ty = process(tX, ty)\n",
    "\n",
    "if n_dims:\n",
    "    # PCA is fitted on the train and test rows together, then split back apart.\n",
    "    pca = PCA(n_components=n_dims,\n",
    "            random_state=110)\n",
    "    ttX = pca.fit_transform(np.vstack((X, tX)))\n",
    "    X, tX = ttX[:len(X)], ttX[len(X):]\n",
    "\n",
    "# Train rows followed by test rows, saved below as a single combined file.\n",
    "X_mnist06 = np.vstack((X,tX))\n",
    "print(\"f-mnist06 train dataset shape: \" + str(X.shape))\n",
    "print(\"f-mnist06 test dataset shape: \" + str(tX.shape))\n",
    "\n",
    "savetxt(path_prefix + 'f-mnist06_train.csv', X, delimiter=' ')\n",
    "savetxt(path_prefix + 'f-mnist06_test.csv', tX, delimiter=' ')\n",
    "savetxt(path_prefix + 'f-mnist06.csv', X_mnist06, delimiter=' ')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.1"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
