{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import os\n",
    "import random\n",
    "import matplotlib.pyplot as plt\n",
    "import time\n",
    "from sklearn.linear_model import LogisticRegression\n",
    "from sklearn.utils import shuffle\n",
    "from sklearn.datasets import make_classification\n",
    "from scipy.stats import multivariate_normal\n",
    "from scipy.stats import gaussian_kde\n",
    "from Data_Generation import *\n",
    "from TV_estimation import *\n",
    "from sklearn.neural_network import MLPRegressor\n",
    "from scipy.stats import dirichlet\n",
    "from scipy.stats import gamma\n",
    "from scipy.optimize import minimize"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Exponential distribution"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def Exponential_Sample(s,scale_1, scale_2, sed=1):\n",
    "\n",
    "    np.random.seed(sed)\n",
    "    # x_real = np.random.multivariate_normal(mean=mu_1, cov=Sigma_1, size=s)\n",
    "    # x_syn = np.random.multivariate_normal(mean=mu_2, cov=Sigma_2, size=s)\n",
    "    x_real = np.random.exponential(scale_1, size = s)  \n",
    "    x_syn = np.random.exponential(scale_2, size = s)   \n",
    "    y_real = np.ones(s)\n",
    "    y_syn = np.zeros(s)\n",
    "    X_raw = np.concatenate([x_real,x_syn])\n",
    "    Y_raw = np.concatenate([y_real,y_syn])\n",
    "    return([X_raw,Y_raw])\n",
    "\n",
    "\n",
    "def Psi_Trans(X):\n",
    "    X_new = list(X)\n",
    "    return(X_new)\n",
    "\n",
    "\n",
    "def dividing_train_test(x, y, s, train_size, test_size):\n",
    "    train_ind = random.choices(range(0, s*2), k = train_size)\n",
    "    x_train = x[train_ind]\n",
    "    y_train = y[train_ind]\n",
    "\n",
    "    test_ind = random.choices(range(0, s*2), k = test_size)\n",
    "    x_test = x[test_ind]\n",
    "    y_test = y[test_ind]\n",
    "    return(x_train, y_train, x_test, y_test)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def Dist_TV(x_train, x_test, y_train, y_test,seed = 1):\n",
    "    # CL_TV\n",
    "    # x - transformation\n",
    "    x_train_trans = np.apply_along_axis(Psi_Trans, axis=1, arr=x_train)\n",
    "    x_test_trans = np.apply_along_axis(Psi_Trans, axis=1, arr=x_test)\n",
    "    f_hat = MLPRegressor(hidden_layer_sizes=1,\n",
    "                        tol=1e-7, alpha=0.00001,\n",
    "                        activation='logistic', random_state=seed, max_iter=10000)\n",
    "    f_hat.fit(x_train_trans, y_train)\n",
    "    # predict labels for the testing data\n",
    "    y = f_hat.predict(x_test_trans)\n",
    "    y_pred = np.array([1 if val > 0.5 else 0 for val in y])\n",
    "    # calculate misclassification rate\n",
    "    misclassification_rate = np.mean(y_pred != y_test.reshape(1,-1)[0])\n",
    "    TV_est_CL = abs(1-2*misclassification_rate)\n",
    "    return(TV_est_CL)\n",
    "\n",
    "\n",
    "def MC_TV_Baseline(scale_1, scale_2, seed = 1):\n",
    "    x_sample = np.concatenate([np.random.exponential(scale_1, size = 100000), np.random.exponential(scale_2, size = 100000)]).reshape((200000, 1))\n",
    "    def mvn_P(x):\n",
    "        return expon.pdf(x, scale = scale_1)\n",
    "    \n",
    "    def mvn_Q(x):\n",
    "        return expon.pdf(x, scale = scale_2)\n",
    "    \n",
    "    P_val = np.apply_along_axis(mvn_P, axis=1, arr=x_sample)\n",
    "    Q_val = np.apply_along_axis(mvn_Q, axis=1, arr=x_sample)\n",
    "    TV_est_baseline = np.abs(P_val / (P_val+Q_val) -Q_val / (P_val+Q_val)).mean()\n",
    "    return(TV_est_baseline)\n",
    "\n",
    "\n",
    "def KNN(x_real, x_syn):\n",
    "    T = x_syn.shape[0] # number of samples in x_syn\n",
    "    M = min(int(T/2), int(x_real.shape[0]/2)) \n",
    "    N = T-M\n",
    "    X_sample = x_syn[range(M), :]\n",
    "    Y_1 = x_real[range(M), :] # take M samples of x_real\n",
    "    Y_2 = x_syn[range(M,T), :] # divide x_syn into two sets: X and Y_1\n",
    "    d = x_real.shape[1] # dimension\n",
    "    k = int(M**0.5) # number of neighbors, optimal choice of k\n",
    "    # Initialize the NearestNeighbors model\n",
    "    nbrs = NearestNeighbors(n_neighbors=k, algorithm='auto').fit(Y_1)\n",
    "    # Find k nearest neighbors and distances for each element in X_sample\n",
    "    distances, indices = nbrs.kneighbors(X_sample)\n",
    "    rho_1 = distances[:, -1]\n",
    "    # Initialize the NearestNeighbors model\n",
    "    nbrs = NearestNeighbors(n_neighbors=k, algorithm='auto').fit(Y_2)\n",
    "    # Find k nearest neighbors and distances for each element in X_sample\n",
    "    distances, indices = nbrs.kneighbors(X_sample)\n",
    "    rho_2 = distances[:, -1]\n",
    "    L = (rho_2/ rho_1)**d\n",
    "    g = 0.5 * np.abs(L- 1)\n",
    "    value = np.mean(g)\n",
    "    return value\n",
    "\n",
    "\n",
    "def NNRE(x_real, x_syn):\n",
    "    # Z = [X; Y] where X and Y are matrices with N rows and d columns.\n",
    "    # N is the number of samples in X and Y, and d is the dimension.\n",
    "    # IDX is a matrix, where rows are different nodes and columns are indices of KNNs.\n",
    "    # The first index is the point itself, so we take k+1 nearest neighbors.\n",
    "    Z = np.vstack((x_real, x_syn))\n",
    "    N = x_real.shape[0] # number of samples in x_real\n",
    "    M = x_syn.shape[0] # number of samples in x_syn\n",
    "    d = x_real.shape[1] # dimension\n",
    "    k = int(N**0.5) # number of neighbors\n",
    "    # Calculate k nearest neighbors\n",
    "    nbrs = NearestNeighbors(n_neighbors=k+1).fit(Z)\n",
    "    distances, indices = nbrs.kneighbors(Z)\n",
    "    indices_0 = indices[:,1::]\n",
    "    # For each row (node), obtain how many of KNN are of the set X (those who have indices < N)\n",
    "    Temp = (indices <= N)\n",
    "    # Temp2 is the number of indices from X set.\n",
    "    Temp2 = np.sum(Temp, axis=1)\n",
    "    Rat = (Temp2) / (k - Temp2 + 1)\n",
    "    Temp3 = 0.5 * np.abs(M/N * (Rat[N:N+M]) - 1)\n",
    "    #Temp3 = 0.5 * np.abs(np.array([max(i, 1 / 2) for i in Rat[N:N + M]])-1)\n",
    "    # Average over KNN ratios of Y set\n",
    "    value = np.mean(Temp3)\n",
    "    return(value)\n",
    "\n",
    "\n",
    "def KDE_TV(x_real, x_syn):\n",
    "    kde_real = gaussian_kde(x_real.T, bw_method='silverman')\n",
    "    kde_syn = gaussian_kde(x_syn.T, bw_method='silverman')\n",
    "    sample_size = 50000\n",
    "    sample_1 = kde_real.resample(size = int(sample_size/2)).T\n",
    "    sample_2 = kde_syn.resample(size = int(sample_size/2)).T\n",
    "    x_sample = np.concatenate((sample_1, sample_2))\n",
    "    # print(x_sample.shape)\n",
    "    density_real = kde_real(x_sample.T)\n",
    "    density_syn = kde_syn(x_sample.T)\n",
    "    KDE_est_tv = np.abs(density_real / (density_real + density_syn) - density_syn / (density_real + density_syn)).mean()\n",
    "    # print(KDE_est_tv)\n",
    "    return(KDE_est_tv)\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "m = 1\n",
    "num_diff = 20\n",
    "diff = np.linspace(30, 50, num_diff)\n",
    "num_rep = 10  \n",
    "\n",
    "TV_est_base_mat = np.zeros((num_diff, num_rep))\n",
    "TV_est_CL_mat = np.zeros((num_diff, num_rep))\n",
    "TV_est_MC_mat = np.zeros((num_diff, num_rep))\n",
    "TV_est_KDE_mat = np.zeros((num_diff, num_rep))\n",
    "TV_est_NNR_mat = np.zeros((num_diff, num_rep))\n",
    "TV_est_KNN_mat = np.zeros((num_diff, num_rep))\n",
    "\n",
    "time_base_vec = np.zeros(num_diff)\n",
    "time_CL_vec = np.zeros(num_diff)\n",
    "time_MC_vec = np.zeros(num_diff)\n",
    "time_KDE_vec = np.zeros(num_diff)\n",
    "time_NNR_vec = np.zeros(num_diff)\n",
    "time_KNN_vec = np.zeros(num_diff)\n",
    "\n",
    "\n",
    "for i in range(num_diff):\n",
    "    ## set up\n",
    "    TV_est_base_vec = np.zeros(num_rep)\n",
    "    TV_est_CL_vec = np.zeros(num_rep)\n",
    "    TV_est_MC_vec = np.zeros(num_rep)\n",
    "    TV_est_KDE_vec = np.zeros(num_rep)\n",
    "    TV_est_NNR_vec = np.zeros(num_rep)\n",
    "    TV_est_KNN_vec = np.zeros(num_rep)\n",
    "  \n",
    "    time_base = np.zeros(num_rep)\n",
    "    time_CL = np.zeros(num_rep)\n",
    "    time_MC = np.zeros(num_rep)\n",
    "    time_KDE = np.zeros(num_rep)\n",
    "    time_NNR = np.zeros(num_rep)\n",
    "    time_KNN = np.zeros(num_rep)\n",
    "    \n",
    "\n",
    "    ## raw data generation\n",
    "    scale_1 = 1\n",
    "    scale_2 = 1 + diff[i]\n",
    "    print(i, \": The difference shape_2 - shape_1 = \", diff[i])\n",
    "    train_data = Exponential_Sample(100000,scale_1, scale_2, sed=i)\n",
    "    x_raw = train_data[0].reshape((2*100000, 1))\n",
    "    y_raw = train_data[1].reshape((2*100000, 1))\n",
    "    test_data = Exponential_Sample(500000,scale_1, scale_2, sed=i)\n",
    "    x_test = test_data[0].reshape((2*500000, 1))\n",
    "    y_test = test_data[1].reshape((2*500000, 1))\n",
    "\n",
    "\n",
    "    for k in range(num_rep):\n",
    "        train_size = 10000\n",
    "        train_ind = random.choices(range(0, 200000), k = train_size)\n",
    "        x_train = x_raw[train_ind]\n",
    "        y_train = y_raw[train_ind]\n",
    "        x_real, x_syn = x_train[y_train == 1], x_train[y_train == 0]\n",
    "        x_real = x_real.reshape((x_real.shape[0], 1))\n",
    "        x_syn = x_syn.reshape((x_syn.shape[0], 1))\n",
    "\n",
    "        ## base estimator\n",
    "        start_time = time.time()    # Record the end time\n",
    "        MC_TV_baseline = MC_TV_Baseline(scale_1, scale_2, seed = k)\n",
    "        end_time = time.time()    # Record the end time\n",
    "        time_base[k] = end_time - start_time\n",
    "        TV_est_base_vec[k] = MC_TV_baseline\n",
    "\n",
    "        ## Classifier estimator\n",
    "        start_time = time.time()    # Record the end time\n",
    "        CL_TV_result = Dist_TV(x_train, x_test, y_train, y_test,seed = k)\n",
    "        end_time = time.time()    # Record the end time\n",
    "        time_CL[k] = end_time - start_time\n",
    "        TV_est_CL_vec[k] = CL_TV_result\n",
    "\n",
    "        ## MC (parameter estimation) estimator\n",
    "        real_mean = np.mean(x_real)\n",
    "        syn_mean = np.mean(x_syn)\n",
    "        scale_1_bar = real_mean\n",
    "        scale_2_bar = syn_mean\n",
    "\n",
    "        start_time = time.time()    # Record the end time\n",
    "        MC_TV_result = MC_TV_Baseline(scale_1_bar, scale_2_bar, seed = k)\n",
    "        end_time = time.time()    # Record the end time\n",
    "        time_MC[k] = end_time - start_time\n",
    "        TV_est_MC_vec[k] = MC_TV_result\n",
    "\n",
    "        ## KDE estimator\n",
    "        start_time = time.time()    # Record the end time\n",
    "        KDE_TV_result = KDE_TV(x_real, x_syn)\n",
    "        end_time = time.time()    # Record the end time\n",
    "        time_KDE[k] = end_time - start_time\n",
    "        TV_est_KDE_vec[k] = KDE_TV_result\n",
    "\n",
    "        ## NNR estimator\n",
    "        start_time = time.time()    # Record the end time\n",
    "        NNR_TV_result = NNRE(x_real, x_syn)\n",
    "        end_time = time.time()    # Record the end time\n",
    "        time_NNR[k] = end_time - start_time\n",
    "        TV_est_NNR_vec[k] = NNR_TV_result\n",
    "\n",
    "        ## KNN estimator\n",
    "        start_time = time.time()    # Record the end time\n",
    "        KNN_TV_result = KNN(x_real, x_syn)\n",
    "        end_time = time.time()    # Record the end time\n",
    "        time_KNN[k] = end_time - start_time\n",
    "        TV_est_KNN_vec[k] = KNN_TV_result\n",
    "\n",
    "\n",
    "    TV_est_base_mat[i,:] = TV_est_base_vec\n",
    "    TV_est_CL_mat[i,:] = TV_est_CL_vec\n",
    "    TV_est_MC_mat[i,:] = TV_est_MC_vec\n",
    "    TV_est_KDE_mat[i,:] = TV_est_KDE_vec\n",
    "    TV_est_NNR_mat[i,:] = TV_est_NNR_vec\n",
    "    TV_est_KNN_mat[i,:] = TV_est_KNN_vec\n",
    "\n",
    "    \n",
    "    print(\"Total valuation (TV_est_base) is : \",TV_est_base_vec)\n",
    "    print(\"Total valuation (TV_est_CL) is : \", TV_est_CL_vec)\n",
    "    print(\"Total valuation (TV_est_MC) is : \", TV_est_MC_vec)\n",
    "    print(\"Total valuation (TV_est_KDE) is : \", TV_est_KDE_vec)\n",
    "    print(\"Total valuation (TV_est_NNR) is : \", TV_est_NNR_vec)\n",
    "    print(\"Total valuation (TV_est_KNN) is : \", TV_est_KNN_vec)\n",
    "\n",
    "\n",
    "TV_est_list = [TV_est_base_mat, TV_est_CL_mat, TV_est_MC_mat, TV_est_KDE_mat, TV_est_NNR_mat, TV_est_KNN_mat]\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Gamma distribution"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def Gamma_Sample(s,shape_1, scale_1, shape_2, scale_2, sed=1):\n",
    "\n",
    "    np.random.seed(sed)\n",
    "    # x_real = np.random.multivariate_normal(mean=mu_1, cov=Sigma_1, size=s)\n",
    "    # x_syn = np.random.multivariate_normal(mean=mu_2, cov=Sigma_2, size=s)\n",
    "    x_real = np.random.gamma(shape_1, scale_1, size = s)  # mean = shape*scale, var = shape*sqrt(scale)\n",
    "    x_syn = np.random.gamma(shape_2, scale_2, size = s)   # mean = shape*scale, var = shape*sqrt(scale)\n",
    "    y_real = np.ones(s)\n",
    "    y_syn = np.zeros(s)\n",
    "    X_raw = np.concatenate([x_real,x_syn])\n",
    "    Y_raw = np.concatenate([y_real,y_syn])\n",
    "    return([X_raw,Y_raw])\n",
    "\n",
    "\n",
    "def Psi_Trans(X):\n",
    "    X_new = list(X)\n",
    "    X_new.extend(list(np.log(X)))\n",
    "    return(X_new)\n",
    "\n",
    "\n",
    "def dividing_train_test(x, y, s, train_size, test_size):\n",
    "    train_ind = random.choices(range(0, s*2), k = train_size)\n",
    "    x_train = x[train_ind]\n",
    "    y_train = y[train_ind]\n",
    "\n",
    "    test_ind = random.choices(range(0, s*2), k = test_size)\n",
    "    x_test = x[test_ind]\n",
    "    y_test = y[test_ind]\n",
    "    return(x_train, y_train, x_test, y_test)\n",
    "\n",
    "\n",
    "def moment_estimates(data):\n",
    "    \"\"\"\n",
    "    Estimate Dirichlet distribution parameters using the method of moments.\n",
    "    \n",
    "    Parameters:\n",
    "    - data: Observed data points\n",
    "    \n",
    "    Returns:\n",
    "    - Estimated parameters of the Dirichlet distribution\n",
    "    \"\"\"\n",
    "    # Calculate sample moments\n",
    "    sample_means = np.mean(data, axis=0)\n",
    "    sample_vars = np.var(data, axis=0, ddof=1)\n",
    "    sample_mean_of_log = np.mean(np.log(data), axis=0)\n",
    "    \n",
    "    def equations(alpha):\n",
    "        alpha_0 = np.sum(alpha)\n",
    "        expected_mean = alpha / alpha_0\n",
    "        expected_var = (alpha * (alpha_0 - alpha)) / (alpha_0**2 * (alpha_0 + 1))\n",
    "        return np.concatenate((expected_mean - sample_means, expected_var - sample_vars))\n",
    "    \n",
    "    # Initial guess for alpha\n",
    "    alpha_init = np.ones(data.shape[1])\n",
    "    \n",
    "    # Solve the equations\n",
    "    result = minimize(lambda alpha: np.sum(equations(alpha)**2), alpha_init, method='L-BFGS-B', bounds=[(1e-10, None)]*data.shape[1])\n",
    "    \n",
    "    return result.x\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def Dist_TV(x_train, x_test, y_train, y_test,seed = 1):\n",
    "    # CL_TV\n",
    "    # x - transformation\n",
    "    x_train_trans = np.apply_along_axis(Psi_Trans, axis=1, arr=x_train)\n",
    "    x_test_trans = np.apply_along_axis(Psi_Trans, axis=1, arr=x_test)\n",
    "    f_hat = MLPRegressor(hidden_layer_sizes=1,\n",
    "                        tol=1e-7, alpha=0.00001,\n",
    "                        activation='logistic', random_state=seed, max_iter=10000)\n",
    "    f_hat.fit(x_train_trans, y_train)\n",
    "    # predict labels for the testing data\n",
    "    y = f_hat.predict(x_test_trans)\n",
    "    y_pred = np.array([1 if val > 0.5 else 0 for val in y])\n",
    "    # calculate misclassification rate\n",
    "    misclassification_rate = np.mean(y_pred != y_test.reshape(1,-1)[0])\n",
    "    TV_est_CL = abs(1-2*misclassification_rate)\n",
    "    return(TV_est_CL)\n",
    "\n",
    "\n",
    "def MC_TV_Baseline(shape_1, scale_1, shape_2, scale_2, seed = 1):\n",
    "    x_sample = np.concatenate([np.random.gamma(shape_1, scale_1, size = 100000), np.random.gamma(shape_2, scale_2, size = 100000)]).reshape((200000, 1))\n",
    "    # calculate pdf\n",
    "    # mvn_P = multivariate_normal(mean=mu_1, cov=Sigma_1)  ## P(x) real data distribution\n",
    "    # mvn_Q = multivariate_normal(mean=mu_2, cov=Sigma_2)  ## Q(x) synthetic data distribution\n",
    "    def mvn_P(x):\n",
    "        return gamma.pdf(x, a = shape_1, scale = scale_1)\n",
    "    \n",
    "    def mvn_Q(x):\n",
    "        return gamma.pdf(x, a = shape_2, scale = scale_2)\n",
    "    \n",
    "    P_val = np.apply_along_axis(mvn_P, axis=1, arr=x_sample)\n",
    "    Q_val = np.apply_along_axis(mvn_Q, axis=1, arr=x_sample)\n",
    "    TV_est_baseline = np.abs(P_val / (P_val+Q_val) -Q_val / (P_val+Q_val)).mean()\n",
    "    return(TV_est_baseline)\n",
    "\n",
    "\n",
    "def KNN(x_real, x_syn):\n",
    "    T = x_syn.shape[0] # number of samples in x_syn\n",
    "    M = min(int(T/2), int(x_real.shape[0]/2)) \n",
    "    N = T-M\n",
    "    X_sample = x_syn[range(M), :]\n",
    "    Y_1 = x_real[range(M), :] # take M samples of x_real\n",
    "    Y_2 = x_syn[range(M,T), :] # divide x_syn into two sets: X and Y_1\n",
    "    d = x_real.shape[1] # dimension\n",
    "    k = int(M**0.5) # number of neighbors, optimal choice of k\n",
    "    # Initialize the NearestNeighbors model\n",
    "    nbrs = NearestNeighbors(n_neighbors=k, algorithm='auto').fit(Y_1)\n",
    "    # Find k nearest neighbors and distances for each element in X_sample\n",
    "    distances, indices = nbrs.kneighbors(X_sample)\n",
    "    rho_1 = distances[:, -1]\n",
    "    # Initialize the NearestNeighbors model\n",
    "    nbrs = NearestNeighbors(n_neighbors=k, algorithm='auto').fit(Y_2)\n",
    "    # Find k nearest neighbors and distances for each element in X_sample\n",
    "    distances, indices = nbrs.kneighbors(X_sample)\n",
    "    rho_2 = distances[:, -1]\n",
    "    L = (rho_2/ rho_1)**d\n",
    "    g = 0.5 * np.abs(L- 1)\n",
    "    value = np.mean(g)\n",
    "    return value\n",
    "\n",
    "\n",
    "def NNRE(x_real, x_syn):\n",
    "    # Z = [X; Y] where X and Y are matrices with N rows and d columns.\n",
    "    # N is the number of samples in X and Y, and d is the dimension.\n",
    "    # IDX is a matrix, where rows are different nodes and columns are indices of KNNs.\n",
    "    # The first index is the point itself, so we take k+1 nearest neighbors.\n",
    "    Z = np.vstack((x_real, x_syn))\n",
    "    N = x_real.shape[0] # number of samples in x_real\n",
    "    M = x_syn.shape[0] # number of samples in x_syn\n",
    "    d = x_real.shape[1] # dimension\n",
    "    k = int(N**0.5) # number of neighbors\n",
    "    # Calculate k nearest neighbors\n",
    "    nbrs = NearestNeighbors(n_neighbors=k+1).fit(Z)\n",
    "    distances, indices = nbrs.kneighbors(Z)\n",
    "    indices_0 = indices[:,1::]\n",
    "    # For each row (node), obtain how many of KNN are of the set X (those who have indices < N)\n",
    "    Temp = (indices <= N)\n",
    "    # Temp2 is the number of indices from X set.\n",
    "    Temp2 = np.sum(Temp, axis=1)\n",
    "    Rat = (Temp2) / (k - Temp2 + 1)\n",
    "    Temp3 = 0.5 * np.abs(M/N * (Rat[N:N+M]) - 1)\n",
    "    #Temp3 = 0.5 * np.abs(np.array([max(i, 1 / 2) for i in Rat[N:N + M]])-1)\n",
    "    # Average over KNN ratios of Y set\n",
    "    value = np.mean(Temp3)\n",
    "    return(value)\n",
    "\n",
    "\n",
    "def KDE_TV(x_real, x_syn):\n",
    "    kde_real = gaussian_kde(x_real.T, bw_method='silverman')\n",
    "    kde_syn = gaussian_kde(x_syn.T, bw_method='silverman')\n",
    "    sample_size = 50000\n",
    "    sample_1 = kde_real.resample(size = int(sample_size/2)).T\n",
    "    sample_2 = kde_syn.resample(size = int(sample_size/2)).T\n",
    "    x_sample = np.concatenate((sample_1, sample_2))\n",
    "    # print(x_sample.shape)\n",
    "    density_real = kde_real(x_sample.T)\n",
    "    density_syn = kde_syn(x_sample.T)\n",
    "    KDE_est_tv = np.abs(density_real / (density_real + density_syn) - density_syn / (density_real + density_syn)).mean()\n",
    "    # print(KDE_est_tv)\n",
    "    return(KDE_est_tv)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "m = 1\n",
    "num_diff = 20\n",
    "diff = np.linspace(0, 4, num_diff)\n",
    "num_rep = 10  \n",
    "\n",
    "TV_est_base_mat = np.zeros((num_diff, num_rep))\n",
    "TV_est_CL_mat = np.zeros((num_diff, num_rep))\n",
    "TV_est_MC_mat = np.zeros((num_diff, num_rep))\n",
    "TV_est_KDE_mat = np.zeros((num_diff, num_rep))\n",
    "TV_est_NNR_mat = np.zeros((num_diff, num_rep))\n",
    "TV_est_KNN_mat = np.zeros((num_diff, num_rep))\n",
    "\n",
    "time_base_vec = np.zeros(num_diff)\n",
    "time_CL_vec = np.zeros(num_diff)\n",
    "time_MC_vec = np.zeros(num_diff)\n",
    "time_KDE_vec = np.zeros(num_diff)\n",
    "time_NNR_vec = np.zeros(num_diff)\n",
    "time_KNN_vec = np.zeros(num_diff)\n",
    "\n",
    "\n",
    "for i in range(num_diff):\n",
    "    ## set up\n",
    "    TV_est_base_vec = np.zeros(num_rep)\n",
    "    TV_est_CL_vec = np.zeros(num_rep)\n",
    "    TV_est_MC_vec = np.zeros(num_rep)\n",
    "    TV_est_KDE_vec = np.zeros(num_rep)\n",
    "    TV_est_NNR_vec = np.zeros(num_rep)\n",
    "    TV_est_KNN_vec = np.zeros(num_rep)\n",
    "  \n",
    "    time_base = np.zeros(num_rep)\n",
    "    time_CL = np.zeros(num_rep)\n",
    "    time_MC = np.zeros(num_rep)\n",
    "    time_KDE = np.zeros(num_rep)\n",
    "    time_NNR = np.zeros(num_rep)\n",
    "    time_KNN = np.zeros(num_rep)\n",
    "    \n",
    "\n",
    "    ## raw data generation\n",
    "    shape_1, scale_1 = 1, 2\n",
    "    shape_2, scale_2 = 1 + diff[i], 2 + diff[i]\n",
    "    print(i, \": The difference shape_2 - shape_1 = \", diff[i])\n",
    "    train_data = Gamma_Sample(100000,shape_1, scale_1, shape_2, scale_2, sed=i)\n",
    "    x_raw = train_data[0].reshape((2*100000, 1))\n",
    "    y_raw = train_data[1].reshape((2*100000, 1))\n",
    "    test_data = Gamma_Sample(500000,shape_1, scale_1, shape_2, scale_2, sed=i)\n",
    "    x_test = test_data[0].reshape((2*500000, 1))\n",
    "    y_test = test_data[1].reshape((2*500000, 1))\n",
    "\n",
    "\n",
    "    for k in range(num_rep):\n",
    "        train_size = 10000\n",
    "        train_ind = random.choices(range(0, 200000), k = train_size)\n",
    "        x_train = x_raw[train_ind]\n",
    "        y_train = y_raw[train_ind]\n",
    "        x_real, x_syn = x_train[y_train == 1], x_train[y_train == 0]\n",
    "        x_real = x_real.reshape((x_real.shape[0], 1))\n",
    "        x_syn = x_syn.reshape((x_syn.shape[0], 1))\n",
    "\n",
    "        ## base estimator\n",
    "        start_time = time.time()    # Record the end time\n",
    "        MC_TV_baseline = MC_TV_Baseline(shape_1, scale_1, shape_2, scale_2, seed = k)\n",
    "        end_time = time.time()    # Record the end time\n",
    "        time_base[k] = end_time - start_time\n",
    "        TV_est_base_vec[k] = MC_TV_baseline\n",
    "\n",
    "        ## Classifier estimator\n",
    "        start_time = time.time()    # Record the end time\n",
    "        CL_TV_result = Dist_TV(x_train, x_test, y_train, y_test,seed = k)\n",
    "        end_time = time.time()    # Record the end time\n",
    "        time_CL[k] = end_time - start_time\n",
    "        TV_est_CL_vec[k] = CL_TV_result\n",
    "\n",
    "        ## MC (parameter estimation) estimator\n",
    "        real_mean = np.mean(x_real)\n",
    "        syn_mean = np.mean(x_syn)\n",
    "        real_var = np.var(x_real)\n",
    "        syn_var = np.var(x_syn)\n",
    "        scale_1_bar = real_var / real_mean\n",
    "        shape_1_bar = real_mean**2 / real_var\n",
    "        scale_2_bar = syn_var / syn_mean\n",
    "        shape_2_bar = syn_mean**2 / syn_var\n",
    "\n",
    "        start_time = time.time()    # Record the end time\n",
    "        MC_TV_result = MC_TV_Baseline(shape_1_bar, scale_1_bar, shape_2_bar, scale_2_bar, seed = k)\n",
    "        end_time = time.time()    # Record the end time\n",
    "        time_MC[k] = end_time - start_time\n",
    "        TV_est_MC_vec[k] = MC_TV_result\n",
    "\n",
    "        ## KDE estimator\n",
    "        start_time = time.time()    # Record the end time\n",
    "        KDE_TV_result = KDE_TV(x_real, x_syn)\n",
    "        end_time = time.time()    # Record the end time\n",
    "        time_KDE[k] = end_time - start_time\n",
    "        TV_est_KDE_vec[k] = KDE_TV_result\n",
    "\n",
    "        ## NNR estimator\n",
    "        start_time = time.time()    # Record the end time\n",
    "        NNR_TV_result = NNRE(x_real, x_syn)\n",
    "        end_time = time.time()    # Record the end time\n",
    "        time_NNR[k] = end_time - start_time\n",
    "        TV_est_NNR_vec[k] = NNR_TV_result\n",
    "\n",
    "        ## KNN estimator\n",
    "        start_time = time.time()    # Record the end time\n",
    "        KNN_TV_result = KNN(x_real, x_syn)\n",
    "        end_time = time.time()    # Record the end time\n",
    "        time_KNN[k] = end_time - start_time\n",
    "        TV_est_KNN_vec[k] = KNN_TV_result\n",
    "\n",
    "\n",
    "    TV_est_base_mat[i,:] = TV_est_base_vec\n",
    "    TV_est_CL_mat[i,:] = TV_est_CL_vec\n",
    "    TV_est_MC_mat[i,:] = TV_est_MC_vec\n",
    "    TV_est_KDE_mat[i,:] = TV_est_KDE_vec\n",
    "    TV_est_NNR_mat[i,:] = TV_est_NNR_vec\n",
    "    TV_est_KNN_mat[i,:] = TV_est_KNN_vec\n",
    "\n",
    "    \n",
    "    print(\"Total valuation (TV_est_base) is : \",TV_est_base_vec)\n",
    "    print(\"Total valuation (TV_est_CL) is : \", TV_est_CL_vec)\n",
    "    print(\"Total valuation (TV_est_MC) is : \", TV_est_MC_vec)\n",
    "    print(\"Total valuation (TV_est_KDE) is : \", TV_est_KDE_vec)\n",
    "    print(\"Total valuation (TV_est_NNR) is : \", TV_est_NNR_vec)\n",
    "    print(\"Total valuation (TV_est_KNN) is : \", TV_est_KNN_vec)\n",
    "\n",
    "\n",
    "TV_est_list = [TV_est_base_mat, TV_est_CL_mat, TV_est_MC_mat, TV_est_KDE_mat, TV_est_NNR_mat, TV_est_KNN_mat]"
   ]
  }
 ],
 "metadata": {
  "language_info": {
   "name": "python"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
