{
  "cells": [
    {
      "cell_type": "markdown",
      "source": [
        "Requirements"
      ],
      "metadata": {
        "id": "RPL7ez9PhWw_"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "!pip install pot\n",
        "!pip install numpy\n",
        "!pip install matplotlib\n",
        "!pip install scipy"
      ],
      "metadata": {
        "id": "VumITZU2FzG6"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "Imports"
      ],
      "metadata": {
        "id": "1mP_hvbpceCB"
      }
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "3CuDTncbiFVA"
      },
      "outputs": [],
      "source": [
        "import numpy as np\n",
        "from scipy.spatial import KDTree\n",
        "import matplotlib.pyplot as plt\n",
        "from scipy.spatial.distance import cdist\n",
        "import scipy.stats as stats\n",
        "from collections import defaultdict\n",
        "import matplotlib.ticker as ticker\n",
        "import ot\n",
        "from scipy.stats import sem"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "uw_D99uJesW0"
      },
      "source": [
        "# Utils"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "4VdHVT86iOZ6"
      },
      "outputs": [],
      "source": [
        "def get_gaussian_mechanism_scale(epsilon, delta, sensitivity):\n",
        "    \"\"\"\n",
        "    Computes the minimum noise scale (standard deviation) for the Gaussian mechanism\n",
        "    that satisfies (epsilon, delta)-differential privacy, using the tight bound\n",
        "    from Theorem 8 in Balle & Wang (2018).\n",
        "\n",
        "    Parameters:\n",
        "        epsilon (float): Privacy parameter ε\n",
        "        delta (float): Privacy parameter δ\n",
        "        sensitivity (float): ℓ2-sensitivity of the query/function\n",
        "\n",
        "    Returns:\n",
        "        float: Minimum noise scale σ ensuring (ε, δ)-DP\n",
        "    \"\"\"\n",
        "\n",
        "    # Initialize binary search bounds for the noise scale σ\n",
        "    upper_bound = 100 * sensitivity * np.sqrt(2 * np.log(1.25 / delta)) / epsilon\n",
        "    lower_bound = 0\n",
        "\n",
        "    # Perform binary search to find the smallest σ such that the analytic δ ≤ input δ\n",
        "    for _ in range(50):\n",
        "        candidate_scale = (upper_bound + lower_bound) / 2\n",
        "\n",
        "        # Compute the analytic δ as in Theorem 8 of Balle & Wang (2018)\n",
        "        left = stats.norm.cdf((sensitivity / (2 * candidate_scale)) - epsilon * candidate_scale / sensitivity)\n",
        "        right = stats.norm.cdf((-sensitivity / (2 * candidate_scale)) - epsilon * candidate_scale / sensitivity)\n",
        "        analytic_delta = left - np.exp(epsilon) * right\n",
        "\n",
        "        if analytic_delta <= delta:\n",
        "            # Candidate is a valid upper bound, try smaller σ\n",
        "            upper_bound = candidate_scale\n",
        "        else:\n",
        "            # Candidate is too small, increase σ\n",
        "            lower_bound = candidate_scale\n",
        "\n",
        "        scale = candidate_scale  # Update latest valid candidate\n",
        "\n",
        "    return scale\n",
        "\n",
        "def W1(p1, p2, xs, xt):\n",
        "    \"\"\"\n",
        "    Computes the 1-Wasserstein distance (Earth Mover's Distance) between two\n",
        "    probability distributions `p1` and `p2` supported on `xs` and `xt`, respectively.\n",
        "\n",
        "    Args:\n",
        "        p1 (np.ndarray): Source distribution, a probability vector summing to 1.\n",
        "        p2 (np.ndarray): Target distribution, a probability vector summing to 1.\n",
        "        xs (np.ndarray): Support points for `p1`, shape (n, d) for some n.\n",
        "        xt (np.ndarray): Support points for `p2`, shape (m, d) for some m.\n",
        "\n",
        "    Returns:\n",
        "        float: The 1-Wasserstein distance between `p1` and `p2` using the Euclidean ground cost.\n",
        "    \"\"\"\n",
        "    # Compute the cost matrix M using Euclidean distances between support points\n",
        "    M = cdist(xs, xt, metric='euclidean')  # shape (n, m)\n",
        "\n",
        "    # Use the Earth Mover's Distance (EMD) solver from POT to compute W1 distance\n",
        "    return ot.emd2(p1, p2, M)\n",
        "\n",
        "\n",
        "def unique_vector_proportions(A):\n",
        "    \"\"\"\n",
        "    Returns unique vectors and their proportions in set A.\n",
        "\n",
        "    Args:\n",
        "        A: numpy array of shape (n, d) - set of n vectors in d-dimensional space\n",
        "\n",
        "    Returns:\n",
        "        tuple: (unique_vectors, proportions)\n",
        "               where:\n",
        "               - unique_vectors: array of unique vectors (m x d)\n",
        "               - proportions: array of corresponding proportions (length m)\n",
        "    \"\"\"\n",
        "    # Convert to numpy array if not already\n",
        "    A = np.asarray(A)\n",
        "\n",
        "    # Use dictionary to count occurrences while handling floating-point comparisons\n",
        "    counts = defaultdict(int)\n",
        "    tol = 1e-10  # Tolerance for floating-point comparison\n",
        "\n",
        "    for vec in A:\n",
        "        # Round to handle numerical precision issues\n",
        "        key = tuple(np.round(vec, decimals=10))\n",
        "        counts[key] += 1\n",
        "\n",
        "    # Extract unique vectors and counts\n",
        "    unique_vectors = np.array([np.array(k) for k in counts.keys()])\n",
        "    counts = np.array(list(counts.values()))\n",
        "\n",
        "    # Calculate proportions\n",
        "    proportions = counts / len(A)\n",
        "\n",
        "    return unique_vectors, proportions\n",
        "\n",
        "def random_point_in_ball(d):\n",
        "    \"\"\"\n",
        "    Generates a single point uniformly at random inside the d-dimensional unit ball\n",
        "    centered at the origin, using rejection sampling.\n",
        "\n",
        "    Args:\n",
        "        d (int): Dimension of the space.\n",
        "\n",
        "    Returns:\n",
        "        np.ndarray: A d-dimensional point with ℓ2-norm at most 1.\n",
        "    \"\"\"\n",
        "    while True:\n",
        "        # Sample uniformly from the cube [-1, 1]^d\n",
        "        p = np.random.uniform(-1, 1, d)\n",
        "\n",
        "        # Accept the point only if it lies inside the unit ball\n",
        "        if np.linalg.norm(p) <= 1:\n",
        "            return p\n",
        "\n",
        "\n",
        "def normalize_data(data, R=1):\n",
        "    \"\"\"\n",
        "    Projects each data point onto the ℓ2 ball of radius `R` (default 1),\n",
        "    ensuring that all rows have norm at most `R`.\n",
        "\n",
        "    Args:\n",
        "        data (np.ndarray): Array of shape (n, d), where each row is a d-dimensional data point.\n",
        "        R (float, optional): Maximum allowed norm. Points with norm > R are rescaled. Default is 1.\n",
        "\n",
        "    Returns:\n",
        "        np.ndarray: Array of shape (n, d) with all rows having ℓ2-norm ≤ R.\n",
        "    \"\"\"\n",
        "    # Compute ℓ2-norms of each row (data point)\n",
        "    norms = np.linalg.norm(data, axis=1, keepdims=True)\n",
        "\n",
        "    # Compute scaling factors: 1 if norm ≤ R, else R / norm (avoid divide-by-zero)\n",
        "    scale = np.minimum(1, R / np.maximum(norms, 1e-10))\n",
        "\n",
        "    # Scale each data point accordingly\n",
        "    data_normalized = data * scale\n",
        "\n",
        "    return data_normalized"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "6tWGMmcbV7Mi"
      },
      "source": [
        "# Histograms"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "8ZoI9ZnciYY_"
      },
      "outputs": [],
      "source": [
        "def non_private_histogram(A, B, batch_size=10000):\n",
        "    \"\"\"\n",
        "    Computes a non-private histogram of dataset `A` over the discrete support `B`\n",
        "    by assigning each point in `A` to its nearest neighbor in `B`.\n",
        "\n",
        "    Args:\n",
        "        A (np.ndarray): Array of shape (n, d), the input data points.\n",
        "        B (np.ndarray): Array of shape (m, d), the discrete support (e.g., histogram bins).\n",
        "        batch_size (int, optional): Number of points to process per batch. Default is 10,000.\n",
        "\n",
        "    Returns:\n",
        "        np.ndarray: A length-m probability vector representing the normalized histogram over `B`.\n",
        "    \"\"\"\n",
        "    # Build a KDTree for efficient nearest-neighbor queries on support points\n",
        "    tree = KDTree(B)\n",
        "\n",
        "    # Initialize count vector\n",
        "    counts = np.zeros(len(B))\n",
        "\n",
        "    # Process data in batches to reduce memory usage\n",
        "    for i in range(0, len(A), batch_size):\n",
        "        batch = A[i:i + batch_size]\n",
        "\n",
        "        # Query nearest support point in B for each point in the batch\n",
        "        _, indices = tree.query(batch, k=1)\n",
        "\n",
        "        # Update histogram counts using bincount\n",
        "        counts += np.bincount(indices, minlength=len(B))\n",
        "\n",
        "    # Normalize counts to form a probability distribution\n",
        "    return counts / len(A)\n",
        "\n",
        "\n",
        "def noisy_histogram_gaussian(histogram, scale):\n",
        "    \"\"\"\n",
        "    Adds Gaussian noise to a histogram and normalizes the result\n",
        "    to obtain a valid probability distribution.\n",
        "\n",
        "    Args:\n",
        "        histogram (np.ndarray): A length-m histogram representing counts or probabilities.\n",
        "        scale (float): Standard deviation of the Gaussian noise added to each bin.\n",
        "\n",
        "    Returns:\n",
        "        np.ndarray: A length-m probability vector after noise addition, thresholding, and normalization.\n",
        "    \"\"\"\n",
        "    # Add independent Gaussian noise to each histogram bin\n",
        "    noisy_histogram = histogram + np.random.normal(loc=0, scale=scale, size=len(histogram))\n",
        "\n",
        "    # Ensure non-negativity by thresholding at zero\n",
        "    thresholded = np.maximum(noisy_histogram, 0)\n",
        "\n",
        "    # Normalize to make the result a valid probability distribution\n",
        "    return thresholded / np.sum(thresholded)\n",
        "\n",
        "def Laplace_histogram(histogram, n, epsilon, delta):\n",
        "    \"\"\"\n",
        "    Applies the Laplace mechanism with thresholding to a histogram to ensure\n",
        "    (ε, δ)-differential privacy, following standard postprocessing heuristics.\n",
        "\n",
        "    Args:\n",
        "        histogram (np.ndarray): A length-m probability vector (summing to 1).\n",
        "        n (int): Number of samples used to generate the histogram (i.e., histogram = counts / n).\n",
        "        epsilon (float): Privacy parameter ε.\n",
        "        delta (float): Privacy parameter δ.\n",
        "\n",
        "    Returns:\n",
        "        np.ndarray: A length-m probability vector after adding Laplace noise, thresholding,\n",
        "                    rounding, and renormalization. Returns all zeros if total mass is zero.\n",
        "    \"\"\"\n",
        "    # Convert histogram to integer counts\n",
        "    counts = n * histogram\n",
        "\n",
        "    # Set Laplace scale\n",
        "    scale = 2 / epsilon\n",
        "\n",
        "    # Initialize noisy counts\n",
        "    noisy_counts = np.zeros(len(counts))\n",
        "\n",
        "    # Add noise only to nonzero entries and apply a threshold for (epsilon,δ)-approximate DP\n",
        "    for i in range(len(counts)):\n",
        "        if histogram[i] > 0:\n",
        "            noisy_counts[i] = counts[i] + np.random.laplace(loc=0, scale=scale)\n",
        "\n",
        "            # Thresholding to remove low-count bins (ensures approximate DP)\n",
        "            if noisy_counts[i] < 2 * np.log(2 / delta) / epsilon + 1:\n",
        "                noisy_counts[i] = 0\n",
        "\n",
        "    # Round and cast to integers to preserve count semantics\n",
        "    noisy_counts = np.round(noisy_counts).astype(int)\n",
        "\n",
        "    # Normalize if there's any remaining mass\n",
        "    if noisy_counts.sum() > 0:\n",
        "        return noisy_counts / noisy_counts.sum()\n",
        "\n",
        "    # Return zero vector if all bins were removed\n",
        "    return noisy_counts"
      ]
    },
    {
      "cell_type": "markdown",
      "source": [
        "# APIs"
      ],
      "metadata": {
        "id": "2Hf-8iWvOZgu"
      }
    },
    {
      "cell_type": "markdown",
      "source": [
        "Random_API"
      ],
      "metadata": {
        "id": "vk8vzsXIOiMh"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "def points_in_unit_ball(n, d, b=1):\n",
        "    \"\"\"\n",
        "    Generates `n` points uniformly distributed inside the d-dimensional ball\n",
        "    of radius `b`, centered at the origin.\n",
        "\n",
        "    Args:\n",
        "        n (int): Number of points to generate.\n",
        "        d (int): Dimension of the space.\n",
        "        b (float, optional): Radius of the ball. Default is 1.\n",
        "\n",
        "    Returns:\n",
        "        np.ndarray: Array of shape (n, d) where each row is a point uniformly\n",
        "                    sampled from the d-dimensional ball of radius `b`.\n",
        "    \"\"\"\n",
        "    # Step 1: Sample directions uniformly on the surface of the d-dimensional unit sphere\n",
        "    points = np.random.normal(size=(n, d))                # Gaussian samples\n",
        "    norms = np.linalg.norm(points, axis=1, keepdims=True) # Compute Euclidean norms\n",
        "    directions = points / norms                           # Normalize to unit vectors\n",
        "\n",
        "    # Step 2: Sample radii using the distribution r^{d-1} to ensure uniformity in volume\n",
        "    radii = np.random.uniform(size=n) ** (1 / d) * b      # Scale to radius b\n",
        "\n",
        "    # Step 3: Return scaled points within the ball\n",
        "    return directions * radii.reshape(-1, 1)"
      ],
      "metadata": {
        "id": "7-4a9EIYOhru"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "Variation_API"
      ],
      "metadata": {
        "id": "-zgM7ky1Oa7a"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "def vars_fixed_level_of_noise(original_data, k, scale):\n",
        "    n, d = original_data.shape\n",
        "\n",
        "    # Stack k copies of the original data vertically\n",
        "    stacked_data = np.tile(original_data, (k, 1))\n",
        "    noise = np.random.normal(loc=0, scale=scale, size=(n * k, d))\n",
        "    perturbed_data = stacked_data + noise\n",
        "\n",
        "    return perturbed_data\n",
        "\n",
        "def vars(data, diameter, alpha, num_vars = 2):\n",
        "\n",
        "    d = data.shape[1]\n",
        "    levels = int(np.ceil(np.log2(diameter/alpha)))\n",
        "    variations = data\n",
        "    for level in range(1,levels+1):\n",
        "      scale = alpha*2**(level-1)/(np.sqrt(np.pi)*((np.sqrt(d) + np.log(2))**2 + np.log(2)))\n",
        "      level_vars = vars_fixed_level_of_noise(data, num_vars, scale)\n",
        "      variations = np.concatenate([variations, level_vars])\n",
        "\n",
        "    return normalize_data(variations)"
      ],
      "metadata": {
        "id": "DGAk6XjLOb65"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "fUgCfMtdU5Dp"
      },
      "source": [
        "# Algorithms"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "0WPk80yfV3Ub"
      },
      "source": [
        "We start with code for Private evolution (Algorithm 2) that returns the final accuracy in 1-Wasserstein"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "LMvdCslGii6b"
      },
      "outputs": [],
      "source": [
        "def PE(data, diameter, epsilon, delta, initial_data=None, num_vars=2):\n",
        "    \"\"\"\n",
        "    Runs the Private Evolution (PE) algorithm to generate synthetic data and\n",
        "    computes the 1-Wasserstein distance between the real and synthetic data distributions.\n",
        "\n",
        "    Args:\n",
        "        data (np.ndarray): Input sensitive dataset of shape (n, d).\n",
        "        diameter (float): Diameter of the data domain (used to scale noise).\n",
        "        epsilon (float): Differential privacy parameter ε.\n",
        "        delta (float): Differential privacy parameter δ.\n",
        "        initial_data (np.ndarray, optional): Optional initialization for the synthetic data.\n",
        "        num_vars (int, optional): Number of noisy variations generated per point per level.\n",
        "\n",
        "    Returns:\n",
        "        float: 1-Wasserstein distance between the real and synthetic distributions.\n",
        "    \"\"\"\n",
        "    n, d = data.shape\n",
        "\n",
        "    # Set number of PE iterations based on data size and privacy level\n",
        "    T = 2 * int(np.log(n * epsilon)) + 1\n",
        "\n",
        "    # Compute the Gaussian noise scale for the overall mechanism\n",
        "    scale = get_gaussian_mechanism_scale(epsilon, delta, np.sqrt(2 * T) / n)\n",
        "\n",
        "    # Determine alpha resolution from noise scale\n",
        "    alpha = diameter * scale ** (1 / np.max([2, d]))\n",
        "\n",
        "    # Estimate total number of synthetic variations needed\n",
        "    noise_levels = int(np.ceil(np.log2(diameter / alpha)))\n",
        "    total_vars = noise_levels * num_vars + 1\n",
        "    n_s = int(total_vars ** (1 / np.max([d, 2])-1) * scale ** (-1)) + 1\n",
        "\n",
        "    # Initialize synthetic data either randomly or from a provided dataset\n",
        "    if initial_data is None:\n",
        "        synthetic_data = points_in_unit_ball(n, d)\n",
        "    else:\n",
        "        synthetic_data = initial_data[np.random.choice(initial_data.shape[0], size=n_s)]\n",
        "\n",
        "    # Iteratively improve synthetic data using private updates\n",
        "    for _ in range(1, T + 1):\n",
        "        # Generate noisy variations of synthetic data\n",
        "        synthetic_data = vars(synthetic_data, diameter, alpha, num_vars)\n",
        "\n",
        "        # Construct nearest-neighbor histogram from real data over synthetic support\n",
        "        histogram = non_private_histogram(data, synthetic_data)\n",
        "\n",
        "        # Apply Gaussian noise and renormalize\n",
        "        noisy_histogram = noisy_histogram_gaussian(histogram, scale)\n",
        "\n",
        "        # Resample synthetic data from noisy histogram\n",
        "        synthetic_data = synthetic_data[np.random.choice(\n",
        "            synthetic_data.shape[0], size=n_s, p=noisy_histogram\n",
        "        )]\n",
        "\n",
        "    # Compute and return Wasserstein distance between real and synthetic histograms\n",
        "    unique_vectors, proportions = unique_vector_proportions(data)\n",
        "    unique_synthetic_vectors, synthetic_proportions = unique_vector_proportions(synthetic_data)\n",
        "\n",
        "    return W1(proportions, synthetic_proportions, unique_vectors, unique_synthetic_vectors)"
      ]
    },
    {
      "cell_type": "markdown",
      "source": [
        "Below, we provide the code for Private evolution (Algorithm 2), which returns the accuracy in 1-Wasserstein at each iteration"
      ],
      "metadata": {
        "id": "rAthounIPtNZ"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "def PE_trajectory(data, diameter, epsilon, delta, initial_data=None, num_vars=2):\n",
        "    \"\"\"\n",
        "    Runs the Private Evolution (PE) algorithm and records the 1-Wasserstein distance\n",
        "    between the real and synthetic data distributions at each iteration.\n",
        "\n",
        "    Args:\n",
        "        data (np.ndarray): Sensitive input dataset of shape (n, d).\n",
        "        diameter (float): Diameter of the data domain (used for setting noise scale).\n",
        "        epsilon (float): Differential privacy parameter ε.\n",
        "        delta (float): Differential privacy parameter δ.\n",
        "        initial_data (np.ndarray, optional): Optional initial synthetic dataset.\n",
        "        num_vars (int, optional): Number of noisy variations per point per level.\n",
        "\n",
        "    Returns:\n",
        "        np.ndarray: Array of length T+1 containing the Wasserstein distance trajectory\n",
        "                    over all iterations (including initialization).\n",
        "    \"\"\"\n",
        "    n, d = data.shape\n",
        "\n",
        "    # Number of PE iterations\n",
        "    T = 2 * int(np.log(n * epsilon)) + 1\n",
        "\n",
        "    # Gaussian noise scale for differential privacy\n",
        "    scale = get_gaussian_mechanism_scale(epsilon, delta, np.sqrt(2 * T) / n)\n",
        "\n",
        "    # Minimum resolution for synthetic variations\n",
        "    alpha = diameter * scale ** (1 / np.max([2, d]))\n",
        "\n",
        "    # Determine number of variation levels and required sample size\n",
        "    noise_levels = int(np.ceil(np.log2(diameter / alpha)))\n",
        "    total_vars = noise_levels * num_vars + 1\n",
        "    n_s = int(total_vars ** (1 / np.max([d, 2]) - 1) * scale ** (-1)) + 1\n",
        "\n",
        "    # Precompute histogram of real data\n",
        "    unique_vectors, proportions = unique_vector_proportions(data)\n",
        "\n",
        "    # Initialize synthetic data\n",
        "    if initial_data is None:\n",
        "        synthetic_data = points_in_unit_ball(n_s, d)\n",
        "    else:\n",
        "        synthetic_data = initial_data[np.random.choice(initial_data.shape[0], size=n_s)]\n",
        "\n",
        "    # Initial Wasserstein distance\n",
        "    unique_synthetic_vectors, synthetic_proportions = unique_vector_proportions(synthetic_data)\n",
        "    trajectory = [W1(proportions, synthetic_proportions, unique_vectors, unique_synthetic_vectors)]\n",
        "\n",
        "    # Iterate and update synthetic data while recording distance\n",
        "    for _ in range(1, T + 1):\n",
        "        synthetic_data = vars(synthetic_data, diameter, alpha, num_vars)\n",
        "        histogram = non_private_histogram(data, synthetic_data)\n",
        "        noisy_histogram = noisy_histogram_gaussian(histogram, scale)\n",
        "        synthetic_data = synthetic_data[np.random.choice(\n",
        "            synthetic_data.shape[0], size=n_s, p=noisy_histogram\n",
        "        )]\n",
        "        unique_synthetic_vectors, synthetic_proportions = unique_vector_proportions(synthetic_data)\n",
        "        trajectory.append(W1(proportions, synthetic_proportions, unique_vectors, unique_synthetic_vectors))\n",
        "\n",
        "    return np.array(trajectory)"
      ],
      "metadata": {
        "id": "RfYQh0l1PhME"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "Next, we present a variant of Private Evolution (algorithm 2) that uses our theory-predicted parameter setting for all parameters, except T"
      ],
      "metadata": {
        "id": "adLxJzJkXVEy"
      }
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "mP6ESExTq1Tv"
      },
      "outputs": [],
      "source": [
        "def suboptimalTPE_last_iterate(T, data, diameter, epsilon, delta, initial_data=None, num_vars=2):\n",
        "    \"\"\"\n",
        "    Runs T iterations of a suboptimal variant of the Private Evolution algorithm and\n",
        "    returns the final Wasserstein distance between real and synthetic distributions.\n",
        "\n",
        "    Args:\n",
        "        T (int): Number of iterations.\n",
        "        data (np.ndarray): Input dataset of shape (n, d).\n",
        "        diameter (float): Diameter of the data domain.\n",
        "        epsilon (float): DP parameter ε.\n",
        "        delta (float): DP parameter δ.\n",
        "        initial_data (np.ndarray, optional): Optional initial synthetic dataset.\n",
        "        num_vars (int): Number of noisy variations per level.\n",
        "\n",
        "    Returns:\n",
        "        float: Final Wasserstein distance between true and synthetic distributions.\n",
        "    \"\"\"\n",
        "    n, d = data.shape\n",
        "\n",
        "    # Compute DP noise scale and resolution\n",
        "    scale = get_gaussian_mechanism_scale(epsilon, delta, np.sqrt(2 * T) / n)\n",
        "    alpha = diameter * scale**(1 / max(2, d))\n",
        "\n",
        "    # Estimate number of total variations and required synthetic samples\n",
        "    noise_levels = int(np.ceil(np.log2(diameter / alpha)))\n",
        "    total_vars = noise_levels * num_vars + 1\n",
        "    n_s = int(total_vars**(1 / max(2, d)-1) * scale**-1) + 1\n",
        "\n",
        "    # Compute histogram of original data\n",
        "    unique_vectors, proportions = unique_vector_proportions(data)\n",
        "\n",
        "    # Initialize synthetic data\n",
        "    if initial_data is None:\n",
        "        synthetic_data = points_in_unit_ball(n_s, d)\n",
        "    else:\n",
        "        synthetic_data = initial_data[np.random.choice(initial_data.shape[0], size=n_s)]\n",
        "\n",
        "    # Run T iterations of noisy updates\n",
        "    for i in range(1, T + 1):\n",
        "        synthetic_data = normalize_data(vars(synthetic_data, diameter, alpha, num_vars))\n",
        "        histogram = non_private_histogram(data, synthetic_data)\n",
        "        noisy_histogram = noisy_histogram_gaussian(histogram, scale)\n",
        "        synthetic_data = synthetic_data[np.random.choice(\n",
        "            synthetic_data.shape[0], size=n_s, p=noisy_histogram\n",
        "        )]\n",
        "\n",
        "    # Compute and return final W1 error\n",
        "    unique_synthetic_vectors, synthetic_proportions = unique_vector_proportions(synthetic_data)\n",
        "    return W1(proportions, synthetic_proportions, unique_vectors, unique_synthetic_vectors)\n"
      ]
    },
    {
      "cell_type": "markdown",
      "source": [
        "Now, we present a variant of Private Evolution (algorithm 2) that uses our theory-predicted parameter setting for all parameters, except n_s"
      ],
      "metadata": {
        "id": "JJm2kx-iXJUY"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "def suboptimal_ns_last_iterate(n_s, data, diameter, epsilon, delta, initial_data=None, num_vars=2):\n",
        "    \"\"\"\n",
        "    Runs a suboptimal variant of the Private Evolution algorithm using a fixed number\n",
        "    of synthetic samples `n_s` and returns the final 1-Wasserstein error and a computed\n",
        "    theoretical estimate for `n_s`.\n",
        "\n",
        "    Args:\n",
        "        n_s (int): Number of synthetic samples used at each iteration.\n",
        "        data (np.ndarray): Real dataset of shape (n, d).\n",
        "        diameter (float): Diameter of the data domain.\n",
        "        epsilon (float): DP parameter ε.\n",
        "        delta (float): DP parameter δ.\n",
        "        initial_data (np.ndarray, optional): Optional initialization for synthetic data.\n",
        "        num_vars (int, optional): Number of variations per data point per level.\n",
        "\n",
        "    Returns:\n",
        "        tuple:\n",
        "            - float: Final 1-Wasserstein distance between real and synthetic distributions.\n",
        "            - int: Theoretical recommendation for `n_s`.\n",
        "    \"\"\"\n",
        "    n, d = data.shape\n",
        "\n",
        "    # Number of PE iterations\n",
        "    T = 2 * int(np.log(n * epsilon)) + 1\n",
        "\n",
        "    # DP noise scale and resolution\n",
        "    scale = get_gaussian_mechanism_scale(epsilon, delta, np.sqrt(2 * T) / n)\n",
        "    alpha = diameter * scale ** (1 / max(d, 2))\n",
        "\n",
        "    # Number of noise levels and total variations per iteration\n",
        "    noise_levels = int(np.ceil(np.log2(diameter / alpha)))\n",
        "    total_vars = noise_levels * num_vars + 1\n",
        "\n",
        "    # Compute histogram of original data\n",
        "    unique_vectors, proportions = unique_vector_proportions(data)\n",
        "\n",
        "    # Initialize synthetic data\n",
        "    if initial_data is None:\n",
        "        synthetic_data = points_in_unit_ball(n_s, d)\n",
        "    else:\n",
        "        synthetic_data = initial_data[np.random.choice(initial_data.shape[0], size=n_s)]\n",
        "\n",
        "    # Iteratively update synthetic data\n",
        "    for i in range(1, T + 1):\n",
        "        synthetic_data = normalize_data(vars(synthetic_data, diameter, alpha, num_vars))\n",
        "        histogram = non_private_histogram(data, synthetic_data)\n",
        "        noisy_histogram = noisy_histogram_gaussian(histogram, scale)\n",
        "        synthetic_data = synthetic_data[np.random.choice(\n",
        "            synthetic_data.shape[0], size=n_s, p=noisy_histogram\n",
        "        )]\n",
        "\n",
        "    # Compute final Wasserstein error\n",
        "    unique_synthetic_vectors, synthetic_proportions = unique_vector_proportions(synthetic_data)\n",
        "    final_error = W1(proportions, synthetic_proportions, unique_vectors, unique_synthetic_vectors)\n",
        "\n",
        "    # Theoretical recommendation for n_s\n",
        "    pred_n_s = int(total_vars ** (1 / max(d, 2) - 1) * scale**-1) + 1\n",
        "\n",
        "    return final_error, pred_n_s\n"
      ],
      "metadata": {
        "id": "c6REjOI4ipHK"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "Finally, we provide the variation of Private evolution (Algorithm 2) with Laplace+thresholding histogram from Section 4.2 in the main body"
      ],
      "metadata": {
        "id": "qevY41gC9GNQ"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "def PE_Laplace(data, diameter, epsilon, delta, initial_data=None, num_vars=2):\n",
        "    \"\"\"\n",
        "    Runs the Private Evolution (PE) algorithm with the histogram privatized by\n",
        "    `Laplace_histogram` instead of the Gaussian mechanism, and returns the\n",
        "    1-Wasserstein distance between the real and synthetic data distributions.\n",
        "\n",
        "    Args:\n",
        "        data (np.ndarray): Input sensitive dataset of shape (n, d).\n",
        "        diameter (float): Diameter of the domain (used to set noise scales).\n",
        "        epsilon (float): Total privacy budget ε; each iteration is given ε / T.\n",
        "        delta (float): Total privacy parameter δ; each iteration is given δ / T.\n",
        "        initial_data (np.ndarray, optional): Pool of points from which the n_s initial\n",
        "            synthetic points are drawn with replacement. If None, the initial points\n",
        "            come from `points_in_unit_ball(n_s, d)`.\n",
        "        num_vars (int, optional): Number of noisy variations generated per point per level.\n",
        "\n",
        "    Returns:\n",
        "        float: 1-Wasserstein distance between real and synthetic distributions.\n",
        "    \"\"\"\n",
        "    n, d = data.shape\n",
        "\n",
        "    # Number of PE iterations; tied to privacy and dataset size\n",
        "    T = 2 * int(np.log(n * epsilon)) + 1\n",
        "\n",
        "    # The Gaussian-mechanism scale is only used to derive α and n_s;\n",
        "    # the histogram itself is privatized by Laplace_histogram below\n",
        "    scale = get_gaussian_mechanism_scale(epsilon, delta, np.sqrt(2 * T) / n)\n",
        "    alpha = diameter * scale ** (1 / np.max([2, d]))\n",
        "\n",
        "    # Number of noise levels, variations per PE step, and synthetic sample size\n",
        "    noise_levels = int(np.ceil(np.log2(diameter / alpha)))\n",
        "    total_vars = noise_levels * num_vars + 1\n",
        "    n_s = int(total_vars ** (1 / np.max([d, 2]) - 1) * scale ** (-1)) + 1\n",
        "\n",
        "    # Initialize synthetic data with n_s points in both branches, so that the first\n",
        "    # iteration (and any revert to the previous set) works on a set of the intended size\n",
        "    if initial_data is None:\n",
        "        synthetic_data = points_in_unit_ball(n_s, d)\n",
        "    else:\n",
        "        synthetic_data = initial_data[np.random.choice(initial_data.shape[0], size=n_s)]\n",
        "\n",
        "    # Perform PE iterations\n",
        "    for _ in range(1, T + 1):\n",
        "        # Kept so the step can be undone if the noisy histogram carries no mass\n",
        "        previous_synthetic_data = synthetic_data.copy()\n",
        "\n",
        "        # Generate and normalize noisy variations\n",
        "        synthetic_data = normalize_data(vars(synthetic_data, diameter, alpha, num_vars))\n",
        "\n",
        "        # Estimate real histogram over current synthetic support\n",
        "        histogram = non_private_histogram(data, synthetic_data)\n",
        "\n",
        "        # Apply Laplace mechanism with per-iteration privacy budget\n",
        "        noisy_histogram = Laplace_histogram(histogram, n, epsilon / T, delta / T)\n",
        "\n",
        "        # If histogram has mass, resample synthetic data accordingly\n",
        "        if noisy_histogram.sum() > 0:\n",
        "            synthetic_data = synthetic_data[np.random.choice(\n",
        "                synthetic_data.shape[0], size=n_s, p=noisy_histogram\n",
        "            )]\n",
        "        else:\n",
        "            # If histogram is empty (e.g., all mass thresholded away), revert\n",
        "            synthetic_data = previous_synthetic_data\n",
        "\n",
        "    # Compute final 1-Wasserstein distance between data and synthetic distributions\n",
        "    unique_vectors, proportions = unique_vector_proportions(data)\n",
        "    unique_synthetic_vectors, synthetic_proportions = unique_vector_proportions(synthetic_data)\n",
        "\n",
        "    return W1(proportions, synthetic_proportions, unique_vectors, unique_synthetic_vectors)"
      ],
      "metadata": {
        "id": "iROv6uRZ9Iz7"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "a1gQGy9Ubm7e"
      },
      "source": [
        "# Paper figures"
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "# Font sizes shared by every figure below, set one rc group at a time\n",
        "plt.rc('font', size=18)                      # base font size\n",
        "plt.rc('axes', titlesize=20, labelsize=20)   # title font, x and y axis labels\n",
        "plt.rc('xtick', labelsize=16)                # tick labels\n",
        "plt.rc('ytick', labelsize=16)\n",
        "plt.rc('legend', fontsize=18)                # legend font"
      ],
      "metadata": {
        "id": "Q7MkLtMNQa_M"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "nRva7mKBbqCH"
      },
      "source": [
        "## Effect of initialization"
      ]
    },
    {
      "cell_type": "markdown",
      "source": [
        "We run the algorithm PE_trajectory from the 'Algorithms' section, starting from different initializations."
      ],
      "metadata": {
        "id": "lilYYLhXXhKr"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "# Experiment parameters\n",
        "d = 2\n",
        "epsilon = 1\n",
        "delta = 10**-4\n",
        "diameter = 2\n",
        "n = 1000\n",
        "\n",
        "repetitions = 100\n",
        "\n",
        "\n",
        "def _initialization_trajectories(beta, n, d, diameter, epsilon, delta, repetitions):\n",
        "    \"\"\"\n",
        "    Runs PE_trajectory `repetitions` times on fresh data and stacks the results.\n",
        "\n",
        "    Each repetition draws x = |points_in_unit_ball(n, d)| as the sensitive data.\n",
        "    With beta=None the initial synthetic data is x itself; otherwise it is\n",
        "    y = -(2 * beta - 1) * x, so beta = 0 gives y = x and beta = 0.5 gives y = 0.\n",
        "\n",
        "    Returns:\n",
        "        np.ndarray: one row per repetition, each row being the output of PE_trajectory.\n",
        "    \"\"\"\n",
        "    runs = []\n",
        "    for _ in range(repetitions):\n",
        "        x = np.abs(points_in_unit_ball(n, d))\n",
        "        y = x if beta is None else -(2 * beta - 1) * x\n",
        "        runs.append(PE_trajectory(x, diameter, epsilon, delta, y))\n",
        "    return np.array(runs)\n",
        "\n",
        "\n",
        "def _plot_mean_with_ci(ax, trajectories, label):\n",
        "    \"\"\"Plots the per-step mean of `trajectories` with a shaded 95% confidence band.\"\"\"\n",
        "    mean_err = np.mean(trajectories, axis=0)\n",
        "    std_err = np.std(trajectories, axis=0, ddof=1) / np.sqrt(trajectories.shape[0])\n",
        "    ci = 1.96 * std_err  # normal-approximation 95% interval\n",
        "\n",
        "    steps = np.arange(mean_err.shape[0])\n",
        "    ax.plot(steps, mean_err, label=label)\n",
        "    ax.fill_between(steps, mean_err - ci, mean_err + ci, alpha=0.3)\n",
        "\n",
        "\n",
        "def _finish_axis(ax, title):\n",
        "    \"\"\"Applies the title, x-label, legend and integer x-ticks shared by the three panels.\"\"\"\n",
        "    ax.set_title(title)\n",
        "    ax.set_xlabel('Steps')\n",
        "    ax.legend()\n",
        "    ax.tick_params(axis='both')\n",
        "    ax.xaxis.set_major_locator(ticker.MaxNLocator(integer=True))\n",
        "\n",
        "\n",
        "fig, axes = plt.subplots(1, 3, figsize=(18, 5), sharey=True)\n",
        "run_args = (n, d, diameter, epsilon, delta, repetitions)\n",
        "\n",
        "### Plot 1: synthetic data initialized at the real data itself (good initialization)\n",
        "_plot_mean_with_ci(\n",
        "    axes[0], _initialization_trajectories(None, *run_args), f'PE ({epsilon}, {delta})-DP'\n",
        ")\n",
        "axes[0].set_ylabel('Error in Wasserstein 1')\n",
        "_finish_axis(axes[0], 'Good initialization')\n",
        "\n",
        "### Plot 2: β = 0.5, i.e. y = -(2 * 0.5 - 1) * x = 0, so every initial point sits at the\n",
        "### origin (bad initialization)\n",
        "_plot_mean_with_ci(\n",
        "    axes[1], _initialization_trajectories(0.5, *run_args), f'PE ({epsilon}, {delta})-DP'\n",
        ")\n",
        "_finish_axis(axes[1], 'Bad initialization')\n",
        "\n",
        "### Plot 3: varying interpolations between y = x (β = 0) and y = 0 (β = 0.5)\n",
        "interpolation = np.arange(0, 0.6, 0.1)\n",
        "for beta in interpolation:\n",
        "    _plot_mean_with_ci(\n",
        "        axes[2], _initialization_trajectories(beta, *run_args), f'β: {beta:.1f}'\n",
        "    )\n",
        "_finish_axis(axes[2], 'Varying initialization')\n",
        "\n",
        "plt.tight_layout()\n",
        "plt.show()"
      ],
      "metadata": {
        "id": "JNAcdqs5RE58"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "F6HJPXxIq0AC"
      },
      "source": [
        "## Suboptimal T"
      ]
    },
    {
      "cell_type": "markdown",
      "source": [
        "We run the algorithm suboptimalTPE_last_iterate from the 'Algorithms' section for different values of T."
      ],
      "metadata": {
        "id": "DefkfMj4XSdK"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "# Experiment parameters\n",
        "d = 2\n",
        "epsilon = 1\n",
        "delta = 1e-4\n",
        "diameter = 2\n",
        "steps = np.arange(5, 101, 5)\n",
        "sample_sizes = [500, 1000, 2000]\n",
        "\n",
        "repetitions = 100\n",
        "\n",
        "fig, axes = plt.subplots(1, 3, figsize=(18, 5), sharey=True)\n",
        "\n",
        "# One panel per sample size\n",
        "for idx, (ax, n) in enumerate(zip(axes, sample_sizes)):\n",
        "    # Rows: repetitions (fresh data each time); columns: candidate numbers of steps\n",
        "    error_rows = []\n",
        "    for _ in range(repetitions):\n",
        "        data = np.abs(points_in_unit_ball(n, d))\n",
        "        row = []\n",
        "        for t in steps:\n",
        "            row.append(\n",
        "                suboptimalTPE_last_iterate(t, data, diameter, epsilon, delta, initial_data=-data)\n",
        "            )\n",
        "        error_rows.append(row)\n",
        "    error_table = np.array(error_rows)\n",
        "\n",
        "    # Mean error and normal-approximation 95% confidence band across repetitions\n",
        "    mean_err = np.mean(error_table, axis=0)\n",
        "    ci = 1.96 * (np.std(error_table, axis=0, ddof=1) / np.sqrt(repetitions))\n",
        "\n",
        "    ax.plot(steps, mean_err, label='PE with different T')\n",
        "    ax.fill_between(steps, mean_err - ci, mean_err + ci, alpha=0.3)\n",
        "\n",
        "    # Vertical marker at the predicted number of steps, 2 * int(log(n * epsilon)) + 1\n",
        "    T = 2 * int(np.log(n * epsilon)) + 1\n",
        "    ax.axvline(x=T, color='k', linestyle='--', label='Predicted T')\n",
        "\n",
        "    ax.set_title(f'n = {n}')\n",
        "    ax.set_xlabel('Number of steps')\n",
        "    if idx == 0:\n",
        "        # The y-axis is shared, so only the leftmost panel is labelled\n",
        "        ax.set_ylabel('Error in 1-Wasserstein')\n",
        "    ax.xaxis.set_major_locator(ticker.MaxNLocator(integer=True))\n",
        "    ax.tick_params(axis='both')\n",
        "    ax.legend()\n",
        "\n",
        "fig.suptitle('Performance of Final Iterate of PE vs Number of Steps')\n",
        "# Leave the top 5% of the figure free for the suptitle\n",
        "plt.tight_layout(rect=[0, 0, 1, 0.95])\n",
        "plt.show()"
      ],
      "metadata": {
        "id": "2FpPYb74Q6Wr"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "## Suboptimal n_s"
      ],
      "metadata": {
        "id": "-H1JzUDfiDUG"
      }
    },
    {
      "cell_type": "markdown",
      "source": [
        "We run the algorithm suboptimal_ns_last_iterate from the 'Algorithms' section for different values of n_s."
      ],
      "metadata": {
        "id": "22BW2-N0Ztuf"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "#Experiment parameters\n",
        "d = 2\n",
        "epsilon = 1\n",
        "delta = 1e-4\n",
        "diameter = 2\n",
        "# Candidate numbers of synthetic data points n_s (x-axis of every panel)\n",
        "n_s_values = np.arange(10, 201, 10)\n",
        "sample_sizes = [500, 1000, 2000]\n",
        "\n",
        "repetitions = 100\n",
        "\n",
        "fig, axes = plt.subplots(1, 3, figsize=(18, 5), sharey=True)\n",
        "\n",
        "for idx, n in enumerate(sample_sizes):\n",
        "    ax = axes[idx]\n",
        "    all_errors = []\n",
        "\n",
        "    for _ in range(repetitions):\n",
        "        data = np.abs(points_in_unit_ball(n, d))\n",
        "        # Each run returns (final W1 error, theoretically recommended n_s)\n",
        "        runs = [\n",
        "            suboptimal_ns_last_iterate(n_s, data, diameter, epsilon, delta, initial_data=-data)\n",
        "            for n_s in n_s_values\n",
        "        ]\n",
        "        all_errors.append([run[0] for run in runs])\n",
        "\n",
        "        # Reuse the recommendation reported by the run with the smallest candidate n_s\n",
        "        # instead of launching an extra full PE run just to read it; after the loop this\n",
        "        # holds the value from the last repetition's data\n",
        "        predicted_ns = runs[0][1]\n",
        "\n",
        "    all_errors = np.array(all_errors)\n",
        "    mean_err = np.mean(all_errors, axis=0)\n",
        "    std_err = np.std(all_errors, axis=0, ddof=1) / np.sqrt(repetitions)\n",
        "    ci = 1.96 * std_err  # normal-approximation 95% interval\n",
        "\n",
        "    ax.plot(n_s_values, mean_err, label='PE with different n_s')\n",
        "    ax.fill_between(n_s_values, mean_err - ci, mean_err + ci, alpha=0.3)\n",
        "\n",
        "    ax.axvline(x=predicted_ns, color='k', linestyle='--', label='Predicted n_s')\n",
        "    ax.set_title(f'n = {n}')\n",
        "    ax.set_xlabel('Number of synthetic data points')\n",
        "    ax.xaxis.set_major_locator(ticker.MaxNLocator(integer=True))\n",
        "    ax.tick_params(axis='both')\n",
        "    if idx == 0:\n",
        "        # The y-axis is shared, so only the leftmost panel is labelled\n",
        "        ax.set_ylabel('Error in 1-Wasserstein')\n",
        "    ax.legend()\n",
        "\n",
        "fig.suptitle('Performance of Final Iterate of PE vs Number of Synthetic Data Points')\n",
        "# Leave the top 5% of the figure free for the suptitle\n",
        "plt.tight_layout(rect=[0, 0, 1, 0.95])\n",
        "plt.show()"
      ],
      "metadata": {
        "id": "_vLnwwVzZhCp"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "## Laplace + thresholding"
      ],
      "metadata": {
        "id": "73EF4RpZA0L6"
      }
    },
    {
      "cell_type": "markdown",
      "source": [
        "We compare the algorithms PE and PE_Laplace from the 'Algorithms' section in two different cases: data lying in a big ball and data lying in a small ball."
      ],
      "metadata": {
        "id": "B2EZHPg2XYOM"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "#Experiment parameters\n",
        "d = 2\n",
        "epsilon = 1\n",
        "delta = 10**-4\n",
        "diameter = 2\n",
        "n_values = np.arange(50, 2051, 200)\n",
        "\n",
        "repetitions = 100\n",
        "\n",
        "# One entry per sample size: mean error and 95% CI half-width for each algorithm\n",
        "pe_mean, pe_ci = [], []\n",
        "lap_mean, lap_ci = [], []\n",
        "\n",
        "for n in n_values:\n",
        "    pe_runs, lap_runs = [], []\n",
        "\n",
        "    for _ in range(repetitions):\n",
        "        data = points_in_unit_ball(n, d, 0.5) #data is in a big ball\n",
        "        # Both algorithms are run on the same dataset within a repetition\n",
        "        pe_runs.append(PE(data, diameter, epsilon, delta))\n",
        "        lap_runs.append(PE_Laplace(data, diameter, epsilon, delta))\n",
        "\n",
        "    pe_mean.append(np.mean(pe_runs))\n",
        "    lap_mean.append(np.mean(lap_runs))\n",
        "    pe_ci.append(1.96 * sem(pe_runs))\n",
        "    lap_ci.append(1.96 * sem(lap_runs))\n",
        "\n",
        "pe_mean, pe_ci = np.array(pe_mean), np.array(pe_ci)\n",
        "lap_mean, lap_ci = np.array(lap_mean), np.array(lap_ci)\n",
        "\n",
        "# Plotting the results with confidence intervals\n",
        "fig, ax = plt.subplots(figsize=(8, 5))\n",
        "\n",
        "ax.plot(n_values, pe_mean, label='PE')\n",
        "ax.fill_between(n_values, pe_mean - pe_ci, pe_mean + pe_ci, alpha=0.3)\n",
        "\n",
        "ax.plot(n_values, lap_mean, label='PE with Laplace noise+thresholding')\n",
        "ax.fill_between(n_values, lap_mean - lap_ci, lap_mean + lap_ci, alpha=0.3)\n",
        "\n",
        "ax.set_xlabel('Sample size $n$')\n",
        "ax.set_ylabel('Error in 1-Wasserstein')\n",
        "ax.legend()\n",
        "fig.tight_layout()\n",
        "plt.show()"
      ],
      "metadata": {
        "id": "grGDFRZi4QWE"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "#Experiment parameters\n",
        "d = 2\n",
        "epsilon = 1\n",
        "delta = 10**-4\n",
        "diameter = 2\n",
        "n_values = np.arange(50, 2051, 200)\n",
        "\n",
        "repetitions = 100\n",
        "\n",
        "# One entry per sample size: mean error and 95% CI half-width for each algorithm\n",
        "pe_mean, pe_ci = [], []\n",
        "lap_mean, lap_ci = [], []\n",
        "\n",
        "for n in n_values:\n",
        "    pe_runs, lap_runs = [], []\n",
        "\n",
        "    for _ in range(repetitions):\n",
        "        data = points_in_unit_ball(n, d, 1/50) #data is in a small ball\n",
        "        # Both algorithms are run on the same dataset within a repetition\n",
        "        pe_runs.append(PE(data, diameter, epsilon, delta))\n",
        "        lap_runs.append(PE_Laplace(data, diameter, epsilon, delta))\n",
        "\n",
        "    pe_mean.append(np.mean(pe_runs))\n",
        "    lap_mean.append(np.mean(lap_runs))\n",
        "    pe_ci.append(1.96 * sem(pe_runs))\n",
        "    lap_ci.append(1.96 * sem(lap_runs))\n",
        "\n",
        "pe_mean, pe_ci = np.array(pe_mean), np.array(pe_ci)\n",
        "lap_mean, lap_ci = np.array(lap_mean), np.array(lap_ci)\n",
        "\n",
        "# Plotting with confidence intervals\n",
        "fig, ax = plt.subplots(figsize=(8, 5))\n",
        "\n",
        "ax.plot(n_values, pe_mean, label='PE')\n",
        "ax.fill_between(n_values, pe_mean - pe_ci, pe_mean + pe_ci, alpha=0.3)\n",
        "\n",
        "ax.plot(n_values, lap_mean, label='PE with Laplace noise+thresholding')\n",
        "# Only the Laplace band has its lower edge clipped at 0\n",
        "lap_lower = np.maximum(lap_mean - lap_ci, 0)\n",
        "ax.fill_between(n_values, lap_lower, lap_mean + lap_ci, alpha=0.3)\n",
        "\n",
        "ax.set_xlabel('Sample size $n$')\n",
        "ax.set_ylabel('Error in 1-Wasserstein')\n",
        "ax.legend()\n",
        "fig.tight_layout()\n",
        "plt.show()"
      ],
      "metadata": {
        "id": "IHPtzo8ySKRz"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "## Varying epsilon and dimension"
      ],
      "metadata": {
        "id": "NJ6uGHPa4WQx"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "d = 2\n",
        "epsilons = np.arange(0.1, 1.1, 0.1)\n",
        "delta = 1e-4\n",
        "diameter = 2\n",
        "n = 1000\n",
        "repetitions = 100\n",
        "\n",
        "# accuracies[r, i] holds the W1 error of repetition r at epsilons[i]\n",
        "accuracies = np.zeros((repetitions, len(epsilons)))\n",
        "\n",
        "# np.ndindex walks the grid row by row: every epsilon for one repetition, then the next\n",
        "for r, i in np.ndindex(*accuracies.shape):\n",
        "    epsilon = epsilons[i]\n",
        "    data = points_in_unit_ball(n, d)  # fresh data for every (repetition, epsilon) pair\n",
        "    accuracies[r, i] = PE(data, diameter, epsilon, delta)\n",
        "\n",
        "# Mean over repetitions and half-width of the 95% confidence interval\n",
        "means = accuracies.mean(axis=0)\n",
        "conf95 = 1.96 * accuracies.std(axis=0, ddof=1) / np.sqrt(repetitions)\n",
        "\n",
        "# Plot mean ± confidence interval\n",
        "fig, ax = plt.subplots()\n",
        "ax.plot(epsilons, means)\n",
        "ax.fill_between(epsilons, means - conf95, means + conf95, alpha=0.2)\n",
        "\n",
        "ax.set_ylabel('Error in 1-Wasserstein')\n",
        "ax.set_xlabel('Epsilon')\n",
        "plt.show()"
      ],
      "metadata": {
        "id": "65MNTOVgB6Ye"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "ds = np.arange(1, 11, 1)\n",
        "epsilon = 1\n",
        "delta = 1e-4\n",
        "diameter = 2\n",
        "n = 1000\n",
        "repetitions = 100\n",
        "\n",
        "# accuracies[r, i] holds the W1 error of repetition r in dimension ds[i]\n",
        "accuracies = np.zeros((repetitions, len(ds)))\n",
        "\n",
        "# np.ndindex walks the grid row by row: every dimension for one repetition, then the next\n",
        "for r, i in np.ndindex(*accuracies.shape):\n",
        "    d = ds[i]\n",
        "    data = points_in_unit_ball(n, d)  # fresh data for every (repetition, dimension) pair\n",
        "    accuracies[r, i] = PE(data, diameter, epsilon, delta)\n",
        "\n",
        "# Mean over repetitions and half-width of the 95% confidence interval\n",
        "means = accuracies.mean(axis=0)\n",
        "conf95 = 1.96 * accuracies.std(axis=0, ddof=1) / np.sqrt(repetitions)\n",
        "\n",
        "# Plot mean curve with shaded 95% confidence band\n",
        "fig, ax = plt.subplots()\n",
        "ax.plot(ds, means)\n",
        "ax.fill_between(ds, means - conf95, means + conf95, alpha=0.2)\n",
        "\n",
        "ax.set_ylabel('Error in 1-Wasserstein')\n",
        "ax.set_xlabel('Dimension')\n",
        "plt.show()"
      ],
      "metadata": {
        "id": "F5MMxdIF9iTU"
      },
      "execution_count": null,
      "outputs": []
    }
  ],
  "metadata": {
    "colab": {
      "collapsed_sections": [
        "uw_D99uJesW0",
        "UFgEZWE_E3ta",
        "psmsA7gdAJWD",
        "5GksVXC2ibIg",
        "PTfjXrqqTwdn",
        "VLTZu0FJe-fJ"
      ],
      "provenance": []
    },
    "kernelspec": {
      "display_name": "Python 3",
      "name": "python3"
    },
    "language_info": {
      "name": "python"
    }
  },
  "nbformat": 4,
  "nbformat_minor": 0
}