{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Differentially Private (DP) Aggregate Seeded Synthesizer\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "> Example based on [Synthetic Data Showcase - _pac-synth_](https://github.com/microsoft/synthetic-data-showcase/blob/main/packages/lib-pacsynth/samples/dp_aggregate_seeded_short_example.ipynb).\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "from snsynth.aggregate_seeded import (\n",
    "    AggregateSeededSynthesizer,\n",
    "    AccuracyMode,\n",
    "    FabricationMode,\n",
    "    AggregateSeededDataset,\n",
    ")\n",
    "from snsynth.transform.table import NoTransformer\n",
    "\n",
    "from utils import gen_data_frame\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Generating an example data frame with random data\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Number of rows requested from the example-data helper.\n",
    "number_of_records_to_generate = 6_000\n",
    "\n",
    "# `sensitive_df` is the input that the synthesizer is fitted on further below.\n",
    "sensitive_df = gen_data_frame(number_of_records_to_generate)\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Generating the synthetic data\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Kept in a variable because the same value is needed again when aggregating the synthetic data.\n",
    "reporting_length = 4\n",
    "\n",
    "# Build the mode objects first so the constructor call stays short.\n",
    "accuracy_mode = AccuracyMode.prioritize_long_combinations()\n",
    "fabrication_mode = FabricationMode.uncontrolled()\n",
    "\n",
    "synth = AggregateSeededSynthesizer(\n",
    "    reporting_length=reporting_length,\n",
    "    epsilon=4.0,\n",
    "    accuracy_mode=accuracy_mode,\n",
    "    fabrication_mode=fabrication_mode,\n",
    "    use_synthetic_counts=True,\n",
    ")\n",
    "\n",
    "synth.fit(sensitive_df, transformer=NoTransformer())\n",
    "\n",
    "# Sample as many rows as `get_dp_number_of_records()` returns for the fitted synthesizer.\n",
    "dp_number_of_records = synth.get_dp_number_of_records()\n",
    "synthetic_df = synth.sample(dp_number_of_records)\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Generating/exporting aggregate data\n",
    "\n",
    "This illustrates how to generate aggregates directly from the sensitive and synthetic data, as well as how to access the DP aggregates.\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "# The same delimiter string is passed to every aggregate call in this cell.\n",
    "aggregates_delimiter = \";\"\n",
    "\n",
    "sensitive_aggregates = synth.get_sensitive_aggregates(aggregates_delimiter)\n",
    "dp_aggregates = synth.get_dp_aggregates(aggregates_delimiter)\n",
    "\n",
    "# Aggregates for the synthetic data are computed from the sampled data frame itself.\n",
    "synthetic_dataset = AggregateSeededDataset.from_data_frame(synthetic_df)\n",
    "synthetic_aggregates = synthetic_dataset.get_aggregates(\n",
    "    reporting_length, aggregates_delimiter\n",
    ")\n"
   ]
  },
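  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Evaluating\n",
    "\n",
    "The two cells below summarize the sensitive and the synthetic data frames with `describe()` so that their per-column statistics can be compared side by side. Empty strings are replaced with `0` first so that every column can be cast to integers.\n"
   ]
  },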
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>H1</th>\n",
       "      <th>H2</th>\n",
       "      <th>H3</th>\n",
       "      <th>H4</th>\n",
       "      <th>H5</th>\n",
       "      <th>H6</th>\n",
       "      <th>H7</th>\n",
       "      <th>H8</th>\n",
       "      <th>H9</th>\n",
       "      <th>H10</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>count</th>\n",
       "      <td>6000.000000</td>\n",
       "      <td>6000.000000</td>\n",
       "      <td>6000.000000</td>\n",
       "      <td>6000.000000</td>\n",
       "      <td>6000.000000</td>\n",
       "      <td>6000.000000</td>\n",
       "      <td>6000.000000</td>\n",
       "      <td>6000.000000</td>\n",
       "      <td>6000.000000</td>\n",
       "      <td>6000.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>mean</th>\n",
       "      <td>1.001333</td>\n",
       "      <td>2.645167</td>\n",
       "      <td>4.596000</td>\n",
       "      <td>0.500500</td>\n",
       "      <td>0.493833</td>\n",
       "      <td>0.503333</td>\n",
       "      <td>0.505000</td>\n",
       "      <td>0.506167</td>\n",
       "      <td>0.506667</td>\n",
       "      <td>0.492500</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>std</th>\n",
       "      <td>0.819417</td>\n",
       "      <td>2.103568</td>\n",
       "      <td>3.322994</td>\n",
       "      <td>0.500041</td>\n",
       "      <td>0.500004</td>\n",
       "      <td>0.500031</td>\n",
       "      <td>0.500017</td>\n",
       "      <td>0.500004</td>\n",
       "      <td>0.499997</td>\n",
       "      <td>0.499985</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>min</th>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>25%</th>\n",
       "      <td>0.000000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>2.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>50%</th>\n",
       "      <td>1.000000</td>\n",
       "      <td>3.000000</td>\n",
       "      <td>5.000000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>75%</th>\n",
       "      <td>2.000000</td>\n",
       "      <td>4.000000</td>\n",
       "      <td>8.000000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>1.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>max</th>\n",
       "      <td>2.000000</td>\n",
       "      <td>6.000000</td>\n",
       "      <td>10.000000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>1.000000</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                H1           H2           H3           H4           H5  \\\n",
       "count  6000.000000  6000.000000  6000.000000  6000.000000  6000.000000   \n",
       "mean      1.001333     2.645167     4.596000     0.500500     0.493833   \n",
       "std       0.819417     2.103568     3.322994     0.500041     0.500004   \n",
       "min       0.000000     0.000000     0.000000     0.000000     0.000000   \n",
       "25%       0.000000     1.000000     2.000000     0.000000     0.000000   \n",
       "50%       1.000000     3.000000     5.000000     1.000000     0.000000   \n",
       "75%       2.000000     4.000000     8.000000     1.000000     1.000000   \n",
       "max       2.000000     6.000000    10.000000     1.000000     1.000000   \n",
       "\n",
       "                H6           H7           H8           H9          H10  \n",
       "count  6000.000000  6000.000000  6000.000000  6000.000000  6000.000000  \n",
       "mean      0.503333     0.505000     0.506167     0.506667     0.492500  \n",
       "std       0.500031     0.500017     0.500004     0.499997     0.499985  \n",
       "min       0.000000     0.000000     0.000000     0.000000     0.000000  \n",
       "25%       0.000000     0.000000     0.000000     0.000000     0.000000  \n",
       "50%       1.000000     1.000000     1.000000     1.000000     0.000000  \n",
       "75%       1.000000     1.000000     1.000000     1.000000     1.000000  \n",
       "max       1.000000     1.000000     1.000000     1.000000     1.000000  "
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "sensitive_df.replace(\"\", \"0\").astype(\"int\").describe()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>H1</th>\n",
       "      <th>H2</th>\n",
       "      <th>H3</th>\n",
       "      <th>H4</th>\n",
       "      <th>H5</th>\n",
       "      <th>H6</th>\n",
       "      <th>H7</th>\n",
       "      <th>H8</th>\n",
       "      <th>H9</th>\n",
       "      <th>H10</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>count</th>\n",
       "      <td>6099.000000</td>\n",
       "      <td>6099.000000</td>\n",
       "      <td>6099.000000</td>\n",
       "      <td>6099.000000</td>\n",
       "      <td>6099.000000</td>\n",
       "      <td>6099.000000</td>\n",
       "      <td>6099.000000</td>\n",
       "      <td>6099.000000</td>\n",
       "      <td>6099.000000</td>\n",
       "      <td>6099.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>mean</th>\n",
       "      <td>0.934415</td>\n",
       "      <td>2.418593</td>\n",
       "      <td>4.207903</td>\n",
       "      <td>0.475652</td>\n",
       "      <td>0.464502</td>\n",
       "      <td>0.484670</td>\n",
       "      <td>0.481226</td>\n",
       "      <td>0.480571</td>\n",
       "      <td>0.482866</td>\n",
       "      <td>0.461223</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>std</th>\n",
       "      <td>0.823738</td>\n",
       "      <td>2.152071</td>\n",
       "      <td>3.444694</td>\n",
       "      <td>0.499448</td>\n",
       "      <td>0.498779</td>\n",
       "      <td>0.499806</td>\n",
       "      <td>0.499688</td>\n",
       "      <td>0.499663</td>\n",
       "      <td>0.499747</td>\n",
       "      <td>0.498535</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>min</th>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>25%</th>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>50%</th>\n",
       "      <td>1.000000</td>\n",
       "      <td>2.000000</td>\n",
       "      <td>4.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>75%</th>\n",
       "      <td>2.000000</td>\n",
       "      <td>4.000000</td>\n",
       "      <td>7.000000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>1.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>max</th>\n",
       "      <td>2.000000</td>\n",
       "      <td>6.000000</td>\n",
       "      <td>10.000000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>1.000000</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                H1           H2           H3           H4           H5  \\\n",
       "count  6099.000000  6099.000000  6099.000000  6099.000000  6099.000000   \n",
       "mean      0.934415     2.418593     4.207903     0.475652     0.464502   \n",
       "std       0.823738     2.152071     3.444694     0.499448     0.498779   \n",
       "min       0.000000     0.000000     0.000000     0.000000     0.000000   \n",
       "25%       0.000000     0.000000     1.000000     0.000000     0.000000   \n",
       "50%       1.000000     2.000000     4.000000     0.000000     0.000000   \n",
       "75%       2.000000     4.000000     7.000000     1.000000     1.000000   \n",
       "max       2.000000     6.000000    10.000000     1.000000     1.000000   \n",
       "\n",
       "                H6           H7           H8           H9          H10  \n",
       "count  6099.000000  6099.000000  6099.000000  6099.000000  6099.000000  \n",
       "mean      0.484670     0.481226     0.480571     0.482866     0.461223  \n",
       "std       0.499806     0.499688     0.499663     0.499747     0.498535  \n",
       "min       0.000000     0.000000     0.000000     0.000000     0.000000  \n",
       "25%       0.000000     0.000000     0.000000     0.000000     0.000000  \n",
       "50%       0.000000     0.000000     0.000000     0.000000     0.000000  \n",
       "75%       1.000000     1.000000     1.000000     1.000000     1.000000  \n",
       "max       1.000000     1.000000     1.000000     1.000000     1.000000  "
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "synthetic_df.replace(\"\", \"0\").astype(\"int\").describe()\n"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3.8.13 ('smartnoise')",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.16"
  },
  "orig_nbformat": 4,
  "vscode": {
   "interpreter": {
    "hash": "ac3c92e511987caa1f56186b87ca2e4f619945ea5b2030935a310f137c3edec0"
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
