{"metadata":{"kernelspec":{"name":"python3","display_name":"Python 3","language":"python"},"language_info":{"name":"python","version":"3.10.10","mimetype":"text/x-python","codemirror_mode":{"name":"ipython","version":3},"pygments_lexer":"ipython3","nbconvert_exporter":"python","file_extension":".py"}},"nbformat_minor":4,"nbformat":4,"cells":[{"cell_type":"markdown","source":"# Private Synopsis\n\n<table align=\"left\">\n\n  <td>\n    <a href=\"https://colab.research.google.com/github/opendp/smartnoise-sdk/blob/main/sql/samples/Synopsis.ipynb\">\n      <img src=\"https://cloud.google.com/ml-engine/images/colab-logo-32px.png\" alt=\"Colab logo\"> Run in Colab\n    </a>\n  </td>\n  <td>\n    <a href=\"https://github.com/opendp/smartnoise-sdk/blob/main/sql/samples/Synopsis.ipynb\">\n      <img src=\"https://cloud.google.com/ml-engine/images/github-logo-32px.png\" alt=\"GitHub logo\">\n      View on GitHub\n    </a>\n  </td>\n  <td>\n    <a href=\"https://www.kaggle.com/notebooks/welcome?src=https://github.com/opendp/smartnoise-sdk/blob/main/sql/samples/Synopsis.ipynb\">\n      <img src=\"data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAACAAAAAgCAYAAABzenr0AAABJ0lEQVR4AezUgUYEURSH8c3IWkDIPkEI+wwhK/sMPUCSAYGQEBCwIBACyYKEIIAkrCQQkmSMJFnJYHD6cCJ/CdaZWQx+hjuX881wb8vM5uo0+wGtg8sxMnddR8AE5t6agCagCZj5APYkP6IDdF8fxzjFCUZIkYQHsD7AM+yXEnvhf4C1NR2OAvvohAb4b3/S4diFDw8K8OGPMvwLO2iHngKeKzocn9jGfOgxRIoHGT5BKsNDAkp8yPB3bCCJv4j+diRfXnlAhn5tAe4Oy1UFvOAMpayfYzE6oMA6FjCCiaHcAaEX0RLGMAncCg2Qd6vIJeIVg8gAfb+JQiLu0asqoI0hTFygGx7ge7o+0MQhOtMG3CJ3N//s6+EKOTJ/fg+kVxmf+aO9YwBrLxFeFqDOPAAAAABJRU5ErkJggg==\" alt=\"Kaggle logo\">\n      Run on Kaggle\n    </a>\n  </td>                                                                                               \n</table>","metadata":{}},{"cell_type":"code","source":"!pip install smartnoise-sql pyspark --quiet","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"import pyspark\nimport numpy as np\nfrom pyspark.sql import SparkSession\nfrom pyspark.sql.functions import *\nspark = SparkSession.builder.getOrCreate()\nspark.sparkContext.setLogLevel(\"ERROR\")","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":"# Download the Data","metadata":{}},{"cell_type":"code","source":"from pathlib import Path\nurl_file = 'https://raw.githubusercontent.com/opendifferentialprivacy/dp-test-datasets/master/data/PUMS_california_demographics/data.csv'\nlocal_file = Path('PUMS_large.csv')\n\n!wget -nc {url_file} -O {str(local_file)}    ","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":"# Load the Data\n\nFirst we load the California demographic data into a Spark `DataFrame`.  We let Spark infer the column names and types, then clean things up a bit.","metadata":{}},{"cell_type":"code","source":"from pyspark.sql.types import FloatType, BooleanType\n\nfilepath = str(local_file)\npums = spark.read.load(filepath, format=\"csv\", sep=\",\",inferSchema=\"true\", header=\"true\")\n\npums = pums.withColumnRenamed(\"_c0\", \"PersonID\")\n\npums = pums.withColumn(\"income\", col(\"income\").cast(FloatType()))\npums = pums.withColumn(\"latino\", col(\"latino\").cast(BooleanType()))\npums = pums.withColumn(\"black\", col(\"black\").cast(BooleanType()))\npums = pums.withColumn(\"asian\", col(\"asian\").cast(BooleanType()))\npums = pums.withColumn(\"married\", col(\"married\").cast(BooleanType()))\n\npums.show(5)\nprint(\"There are {0} individuals in the data\".format(pums.count()))","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"from snsql import *\nfrom pathlib import Path\n\nmetadata_path = Path(\"./PUMS_large.yaml\")\nmetadata_string = \"\"\"PUMS:\n  PUMS:\n    PUMS_large:\n      clamp_counts: False\n      censor_dims: False\n      PersonID:\n        type: int\n        private_id: True\n      state:\n        type: int\n      puma:\n        type: string\n      sex:\n        type: string\n      age:\n        lower: 0\n        upper: 100\n        type: int\n      educ:\n        type: string\n      income:\n        lower: 0\n        upper: 500000\n        type: float\n      latino:\n        type: boolean\n      black:\n        type: boolean\n      asian:\n        type: boolean\n      married:\n        type: boolean\n\"\"\"\nwith open(metadata_path, \"w+\") as f:\n    f.write(metadata_string)\n\npums.createOrReplaceTempView(\"PUMS_large\")\n\nprivate_reader = from_connection(\n    spark, \n    metadata=str(metadata_path), \n    privacy=Privacy(epsilon=3.0, delta=1/1_000_000)\n)\nprivate_reader.reader.compare.search_path = [\"PUMS\"]\n\n\nres = private_reader.execute('SELECT COUNT(*) FROM PUMS_large')\nres.show()","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":"Each row in the data represents a unique individual.\n\n# Get Exact Values for Comparison\n\nThe `private_reader` wraps an existing SparkSQL session and applies differential privacy.  We can access the underlying reader to get exact results with no differential privacy.  This is useful for comparing utility.  For example, we can compute the average income for individuals in the database.","metadata":{}},{"cell_type":"code","source":"query = 'SELECT COUNT(*) AS count, AVG(income) FROM PUMS_large'\n\nreader = private_reader.reader # the underlying connection to Spark SQL\n\nres = reader.execute(query)\nres.show()","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":"# Generate Exact Synopsis\n\nWe can use the `SparkReader` to create a synopsis file that calculates some metrics grouped by the dimensions in the data.  We can then load the synopsis into an Excel spreadsheet to use in a Pivot Table, or query the synopsis from Pandas.","metadata":{}},{"cell_type":"code","source":"query = 'SELECT sex, age, educ, married, latino, black, asian, COUNT(*) AS n, AVG(income) AS income FROM PUMS_large GROUP BY sex, age, educ, married, latino, black, asian ORDER BY n DESC'\n\nsynopsis = private_reader.reader.execute(query)\nsynopsis.show(15)\nprint(\"{0} distinct dimensions\".format(synopsis.count()))","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":"We now have far fewer rows, but we can still recover the exact values.  For example, the average income queried from our synopsis exactly matches the average income we queried above:","metadata":{}},{"cell_type":"code","source":"synopsis.createOrReplaceTempView(\"Synopsis\")\n\nres = reader.execute(\"SELECT SUM(n) AS count, SUM(income * n) / SUM(n) AS avg_income FROM Synopsis\")\nres.show()","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":"Note that we have gone from 1.2 million rows to approximately 20,000 rows, so each row in our synopsis no longer represents an individual.  However, we have still not attempted to use any differential privacy, so our synopsis is not private.  For example, there are several dimensions in our synopsis which uniquely identify individuals.","metadata":{}},{"cell_type":"code","source":"reader.execute(\"SELECT * FROM Synopsis WHERE n <= 1\").show(5)","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":"Additionally, cells with exact counts > 1 can still leak privacy.  To protect against these and other attacks, let's make our synopsis private.\n\n# Generate Private Synopsis\n\nTo generate a private synopsis, we use the same query we used to create the exact synopsis, but we use a `PrivateReader`, which transparently adds differential privacy.","metadata":{}},{"cell_type":"code","source":"from pyspark.sql.types import DecimalType\n\nprivate_synopsis = private_reader.execute(query) # using same query from before\nprivate_synopsis = private_synopsis.withColumn('income', private_synopsis.income.cast(DecimalType(18, 2)))\nprivate_synopsis.show(15)\nprint(\"{0} distinct dimensions\".format(private_synopsis.count()))","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":"Because the `PrivateReader` transparently adds noise, you will notice that the numbers change each time you run the cell above, sometimes even returning negative counts or negative incomes.  However, the larger aggregates are still fairly accurate, because the noise is symmetrical:","metadata":{}},{"cell_type":"code","source":"private_synopsis.persist().createOrReplaceTempView(\"PrivateSynopsis\")\n\nreader.execute(\"SELECT SUM(n) AS count, SUM(income * n) / SUM(n) AS avg_income FROM PrivateSynopsis\").show()\n","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":"Notice that we call `persist()` when loading the private synopsis into a view.  This is how we ensure that Spark doesn't generate a new synopsis every time we query the synopsis.  The goal of a synopsis is to support many queries from a single generation, and we do not want to pay additional `epsilon` privacy cost every time we use the synopsis.  If we create the synopsis once, we can export to Excel or persist in a view, then query indefinitely without incurring further privacy cost.\n\n## PrivateReader Parameters\n\nWhen we created the `PrivateReader` above, we passed in the `epsilon` parameter and wrapped our existing `SparkReader` we created earlier.  The `PrivateReader` simply intercepts calls to `SparkReader` and adds noise calibrated to the requested `epsilon`.  We also passed in some metadata describing the sensitivity of the fields in the data source, loaded from a YAML file.  In particular, the algorithm needed to know that the `income` field ranges between 0 and 500,000, in order to appropriately calibrate the noise:","metadata":{}},{"cell_type":"code","source":"import snsql\nmeta = snsql.metadata.Metadata.from_file(metadata_path)\nprint(meta)","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":"Also notice that the YAML file refers to the PUMS_large table with the prefix PUMS, which is a convention used in the SQL-92 specification allowing tables and views to be grouped together.  Although we are only querying a single source table here, the readers support querying over multiple tables.  Since our query does not specify the full disambiguated table name, we tell our reader to treat PUMS as a default namespace by specifying `private.reader.compare.search_path`.\n\nYou can read more about the other `PrivateReader` options [here](https://opendifferentialprivacy.github.io/smartnoise-samples/docs/api/system/sql/private_reader.html#opendp.smartnoise.sql.private_reader.PrivateReaderOptions)\n\n# Censoring Infrequent Dimensions\n\nOne option worth exploring further is the `censor_dims` option we set to `False` above.  Recall that the number of distinct dimensions in our private synopsis was exactly the same as the number of distinct dimesions in our exact synopsis.  In our exact synopsis, the existence of dimensions with exactly one member constituted a privacy leak.  Since we have added noise, dimensions with fewer than 2 members are significantly less useful:\n","metadata":{}},{"cell_type":"code","source":"reader.execute(\"SELECT * FROM PrivateSynopsis WHERE n <= 1\").show(8)","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":"However, it is possible that the mere existence of a unique dimension combination in the data set would constitute a privacy failure.  For example, if this data represented people with a sensitive medical condition, mere membership would sensitive.  If we want to protect the queries further, we can tell the system to hide infrequent dimensions, adhering to epsilon, delta differential privacy.","metadata":{}},{"cell_type":"code","source":"print(query)\ndelta = 1/1_200_000\n\nmeta[\"PUMS.PUMS_large\"].censor_dims = True\n\nprivate_reader = from_connection(\n    spark, \n    metadata=meta,\n    privacy=Privacy(epsilon=3.0, delta=delta)\n)\nprivate_reader.reader\nprivate_reader.reader.compare.search_path = [\"PUMS\"]\n\n\nprivate_synopsis = private_reader.execute(query)\nprint(\"{0} distinct dimensions\".format(private_synopsis.count()))","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"private_synopsis.persist().createOrReplaceTempView(\"PrivateSynopsis\")\n\nreader.execute(\"SELECT SUM(n) AS count, SUM(income * n) / SUM(n) AS avg_income FROM PrivateSynopsis\").show()","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":"From the outputs, you can see the private synopsis still computes an accurate average income, but we are missing about 6,500 dimensions, representing about 12,000 individuals.  It may be desirable to leave the synopsis like this, to indicate that some individuals have been dropped for privacy reasons.  In some settings, however, this is undesirable, because aggregate counts will be biased downward.  To resolve this, we can add an `other` dimension that recaptures the dropped dimension.\n\n## Recovering Infrequent Dimensions","metadata":{}},{"cell_type":"code","source":"# get the dimensions\nother = 'SELECT DISTINCT sex, age, educ, married, latino, black, asian FROM PUMS_large EXCEPT (SELECT DISTINCT sex, age, educ, married, latino, black, asian FROM PrivateSynopsis)'\nother_dims = reader.execute(other)\nother_dims.persist().createOrReplaceTempView(\"OtherDims\")\nprint(\"Combining {0} distinct dimensions that were dropped.\".format(other_dims.count()))\n\n# get rows that match censored dimensions\nfiltered = 'SELECT t1.* FROM PUMS_large t1 LEFT SEMI JOIN OtherDims t2 ON ( t1.sex = t2.sex AND t1.age = t2.age AND t1.educ = t2.educ AND t1.married = t2.married AND t1.latino = t2.latino AND t1.black = t2.black AND t1.asian = t2.asian)'\nfiltered_pums = reader.execute(filtered)\nfiltered_pums.persist().createOrReplaceTempView(\"PUMS_censored\")\nprint(\"Selecting {0} records from the dimensions that were censored\".format(filtered_pums.count()))","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":"We now have a table, `PUMS_censored`, which has all the records which were censored from our private synopsis.  We can create a differentially private result, treating all of our censored dimensions as a single `other` dimension.  To query these remaining records, we need metadata that describes the new table, `PUMS_censored`.  Since this has the same schema as `PUMS_large`, we can grab the original schema and make a copy for the new table.","metadata":{}},{"cell_type":"code","source":"import copy\npc = copy.copy(meta.m_tables['PUMS.PUMS_large'])\npc.name = 'PUMS_censored'\nmeta.m_tables['PUMS.PUMS_censored'] = pc","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"query_single = 'SELECT COUNT(*) AS n, AVG(income) AS income FROM PUMS_censored'\nmissing_dim = private_reader.execute(query_single).persist()\nmissing_dim.createOrReplaceTempView(\"Censored\")\nmissing_dim.show()\n","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":"# Noisy Values for Missing Dimensions\n\nAnother option is to create a private synopsis for all possible dimension values, where missing values will be set to NULL, which will result in zero counts.  These zero counts will result in zero values.  This approach is not suitable in settings where rare dimensions are private, such as surnames, or when the cross product of all dimensions is very large.  In this case, however, the distinct dimension members are not private, and the cross product is not large.","metadata":{}},{"cell_type":"code","source":"alldims = 'SELECT s.*, a.*, e.*, m.*, l.*, b.*, asi.* \\\n    FROM (SELECT DISTINCT sex FROM PUMS_large) s \\\n    CROSS JOIN (SELECT DISTINCT age FROM PUMS_large) a \\\n    CROSS JOIN (SELECT DISTINCT educ FROM PUMS_large) e \\\n    CROSS JOIN (SELECT DISTINCT married FROM PUMS_large) m \\\n    CROSS JOIN (SELECT DISTINCT latino FROM PUMS_large) l \\\n    CROSS JOIN (SELECT DISTINCT black FROM PUMS_large) b \\\n    CROSS JOIN (SELECT DISTINCT asian FROM PUMS_large) asi'\n\nall_dims = reader.execute(alldims)\nall_dims.persist().createOrReplaceTempView(\"AllDims\")\n\nprint(\"Including empty dimensions, there are {0} total dimensions\".format(all_dims.count()))","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":"Adding the empty dimensions increases our total number of dimensions by about 16,000.","metadata":{}},{"cell_type":"code","source":"joined = 'SELECT p.PersonID, p.state, p.puma, d.sex, d.age, d.educ, d.latino, d.black, d.asian, d.married, p.income \\\n    FROM AllDims d LEFT OUTER JOIN PUMS_large p ON \\\n        d.sex = p.sex AND \\\n        d.age = p.age AND \\\n        d.educ = p.educ AND \\\n        d.latino = p.latino AND \\\n        d.black = p.black AND \\\n        d.asian = p.asian AND \\\n        d.married = p.married'\n\njoined_pums = reader.execute(joined).persist()\njoined_pums.createOrReplaceTempView(\"PUMS_joined\")\nprint(\"There are {0} rows, including empty dimensions\".format(joined_pums.count()))\n\n","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"pc = copy.copy(meta.m_tables['PUMS.PUMS_large'])\npc.name = 'PUMS_joined'\nmeta.m_tables['PUMS.PUMS_joined'] = pc","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"meta[\"PUMS.PUMS_large\"].censor_dims = False\nmeta[\"PUMS.PUMS_large\"].clamp_counts = False\nmeta[\"PUMS.PUMS_large\"].row_privacy = True\n\n\nq = 'SELECT sex, age, educ, married, latino, black, asian, COUNT(*) AS n, AVG(income) AS income FROM PUMS_joined GROUP BY sex, age, educ, married, latino, black, asian'\n\npriv2 = private_reader.execute(q).persist()\npriv2.createOrReplaceTempView(\"PrivateSynopsis2\")\nprint(\"The new private synopsis has {0} dimensions\".format(priv2.count()))\nreader.execute(\"SELECT SUM(n) AS count, SUM(income * n) / SUM(n) AS avg_income FROM PrivateSynopsis2\").show()\n","metadata":{"trusted":true},"execution_count":null,"outputs":[]}]}