{"metadata":{"celltoolbar":"Tags","kernelspec":{"display_name":"Python 3","language":"python","name":"python3"},"language_info":{"name":"python","version":"3.10.10","mimetype":"text/x-python","codemirror_mode":{"name":"ipython","version":3},"pygments_lexer":"ipython3","nbconvert_exporter":"python","file_extension":".py"}},"nbformat_minor":4,"nbformat":4,"cells":[{"cell_type":"markdown","source":"# Policy Laplace on Spark\n\nSpark implementation of Policy Laplace from [Differentially Private Set Union](https://arxiv.org/abs/2002.09745)\n\n<table align=\"left\">\n\n  <td>\n    <a href=\"https://colab.research.google.com/github/opendp/smartnoise-sdk/blob/main/sql/samples/Policy%20Laplace%20on%20Spark.ipynb\">\n      <img src=\"https://cloud.google.com/ml-engine/images/colab-logo-32px.png\" alt=\"Colab logo\"> Run in Colab\n    </a>\n  </td>\n  <td>\n    <a href=\"https://github.com/opendp/smartnoise-sdk/blob/main/sql/samples/Policy%20Laplace%20on%20Spark.ipynb\">\n      <img src=\"https://cloud.google.com/ml-engine/images/github-logo-32px.png\" alt=\"GitHub logo\">\n      View on GitHub\n    </a>\n  </td>\n  <td>\n    <a href=\"https://www.kaggle.com/notebooks/welcome?src=https://github.com/opendp/smartnoise-sdk/blob/main/sql/samples/Policy%20Laplace%20on%20Spark.ipynb\">\n      <img src=\"data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAACAAAAAgCAYAAABzenr0AAABJ0lEQVR4AezUgUYEURSH8c3IWkDIPkEI+wwhK/sMPUCSAYGQEBCwIBACyYKEIIAkrCQQkmSMJFnJYHD6cCJ/CdaZWQx+hjuX881wb8vM5uo0+wGtg8sxMnddR8AE5t6agCagCZj5APYkP6IDdF8fxzjFCUZIkYQHsD7AM+yXEnvhf4C1NR2OAvvohAb4b3/S4diFDw8K8OGPMvwLO2iHngKeKzocn9jGfOgxRIoHGT5BKsNDAkp8yPB3bCCJv4j+diRfXnlAhn5tAe4Oy1UFvOAMpayfYzE6oMA6FjCCiaHcAaEX0RLGMAncCg2Qd6vIJeIVg8gAfb+JQiLu0asqoI0hTFygGx7ge7o+0MQhOtMG3CJ3N//s6+EKOTJ/fg+kVxmf+aO9YwBrLxFeFqDOPAAAAABJRU5ErkJggg==\" alt=\"Kaggle logo\">\n      Run on Kaggle\n    </a>\n  </td>                                                                                               \n</table>","metadata":{}},{"cell_type":"code","source":"!pip install smartnoise-sql pyspark --quiet","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"import pyspark\nimport numpy as np\nfrom pyspark.sql import SparkSession\nfrom pyspark.sql.functions import *\nspark = SparkSession.builder.getOrCreate()","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"from os import path\nif not path.exists('clean_askreddit.csv'):\n    if not path.exists('clean_askreddit.csv'):\n        !pip install wget\n        import wget\n        zip_path = 'https://github.com/joshua-oss/differentially-private-set-union/raw/master/data/clean_askreddit.csv.zip'\n        wget.download(zip_path)\n    import zipfile\n    with zipfile.ZipFile('clean_askreddit.csv.zip', 'r') as zip:\n        zip.extractall('.')\n    ","metadata":{"tags":["hide-input"],"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"filepath = \"clean_askreddit.csv\"\nreddit = spark.read.load(filepath, format=\"csv\", sep=\",\",inferSchema=\"true\", header=\"true\").dropna()","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":"### Prepare Data for Processing\n\nLoad the data from file and tokenize.  This code can be any caller-specific tokenization routine, and is independent of differential privacy.  Output RDD should include one list of tokens per row, but can have multiple rows per user, and does not need to be odered in any way.  
This stage can also be run with other n-gram sizes (e.g. 2-grams, 3-grams), and the resulting RDDs persisted to feed into DPSU.","metadata":{}},{"cell_type":"code","source":"import nltk\n\nn_grams = 1\ndistinct = True\n\ndef tokenize(user_post):\n    # Split each (author, post) pair into tokens; optionally form n-grams and de-duplicate per post\n    user, post = user_post\n    tokens = post.split(\" \")\n    if n_grams > 1:\n        tokens = list(nltk.ngrams(tokens, n_grams))\n        tokens = [\"_\".join(g) for g in tokens]\n    if distinct:\n        tokens = list(set(tokens))\n    return (user, tokens)\n\ntokenized = reddit.select(\"author\", \"clean_text\").rdd.map(tokenize).persist()\n","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":"### Instantiate DPSU Processor\n\nCreate the object and pass in the privacy parameters.","metadata":{}},{"cell_type":"code","source":"from collections import defaultdict\nimport operator\nimport copy\nimport numpy as np\nimport itertools\nfrom pyspark.rdd import portable_hash\n\nclass PolicyLaplace:\n    def __init__(self, epsilon, delta, alpha, tokens_per_user, prune_tail_below=None, num_partitions=1):\n        Delta_0 = tokens_per_user\n        self.Delta_0 = Delta_0  # per-user contribution bound (max tokens per user)\n        self.K = prune_tail_below\n        self.num_partitions = num_partitions if num_partitions is not None else 1\n        self.Delta = 1 / self.num_partitions  # per-user budget within each partition\n\n        l_param = 1 / epsilon  # scale of the Laplace noise added to each histogram count\n        # rho threshold: take the worst case over the number of items t a user can update (1..Delta_0)\n        F_l_rho = lambda t: 1 / t + (1 / epsilon) * np.log(1 / (2 * (1 - (1 - delta) ** (1 / t))))\n        l_rho = np.max([F_l_rho(t) for t in range(1, Delta_0 + 1)])\n        if self.K is not None:\n            # when the tail is pruned at K, anchor the threshold at K instead\n            l_rho = self.K + (1/epsilon) * np.log(1 / (2 * (1 - (1 - delta) ** (1 / Delta_0))))\n\n        Gamma = l_rho + alpha * l_param  # cutoff: counts stop accumulating once they reach Gamma\n        self.Gamma = Gamma\n        self.l_param = l_param\n        self.l_rho = l_rho\n\n        print(\"Params Delta_0={0}, delta={1:.2e}, l_param={2}, l_rho={3}, Gamma={4}\".format(Delta_0, delta, l_param, l_rho, Gamma))\n\n    def exceeds_threshold(self, val):\n        # add Laplace noise to the accumulated count and compare against the rho threshold\n        nval = val + np.random.laplace(0, self.l_param)\n        return nval > self.l_rho\n\n    def prune_tail(self, user_tokens_rdd):\n        \"\"\"Prunes the (user, tokens) RDD to eliminate all words that appear\n            fewer than prune_tail_below times (the threshold supplied at instantiation time).\n        \"\"\"\n        if self.K is None:\n            return user_tokens_rdd\n        tu = user_tokens_rdd.flatMap(lambda row: [(token, row[0]) for token in row[1]])\n        tu = tu.keyBy(lambda row: row[0])\n\n        ut = user_tokens_rdd.flatMap(lambda row: [(token, 1) for token in row[1]])\n        wc = ut.reduceByKey(operator.add)\n        wc = wc.filter(lambda row: (row[1] >= self.K))\n\n        filtered = wc.keyBy(lambda row: row[0]).join(tu).map(lambda row: row[1][1])\n        return filtered.map(lambda row: (row[1], row[0])).groupByKey()\n\n\n    def reservoir_sample(self, user_tokens_rdd, distinct=True):\n        \"\"\"Takes an RDD of (user, tokens) rows, combines all tokens for each user,\n            then samples uniformly to keep at most tokens_per_user tokens per user.
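For example (purely illustrative), with distinct=True the input rows (\"u1\", [\"a\", \"b\"]) and\n            (\"u1\", [\"b\", \"c\"]) are merged into {\"a\", \"b\", \"c\"} before sampling, and the output\n            is flattened into (user, token) pairs.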
User is not\n            assumed to be grouped or sorted on input stream, and users may appear more\n            than once, with different lists of tokens.\n        \"\"\"\n        tokens_per_user = self.Delta_0\n        if tokens_per_user == 1:\n            return user_tokens_rdd\n\n        def selected_grams(row):\n            np.random.seed()\n            user, tokens = row\n            all_grams = list(itertools.chain.from_iterable(tokens))\n            if distinct:\n                all_grams = list(set(all_grams))\n            if tokens_per_user > 1 and len(all_grams) > tokens_per_user:\n                selected = np.random.choice(all_grams, size=tokens_per_user, replace=False).tolist()\n            else:\n                selected = all_grams\n            return [(user, token) for token in selected]\n\n        return user_tokens_rdd.groupByKey().flatMap(selected_grams)\n\n    def process_partitions(self, user_tokens_rdd):\n        \"\"\"Repartitions into the desired number of partitions and\n            runs the DPSU algorithm in parallel.\"\"\"\n        process_rows = self.process_rows\n\n        parts = user_tokens_rdd.keyBy(lambda row: (row[0], row[1]))\n        n = self.num_partitions\n\n        def partition_func(key):\n            return portable_hash(key[1])\n\n        def key_func(entry):\n            return (entry[0], entry[1])\n\n        parts = parts.repartitionAndSortWithinPartitions(numPartitions=n, partitionFunc=partition_func, keyfunc=key_func)\n        parts = parts.map(lambda row: row[1])\n        #parts = user_tokens_rdd.groupByKey().repartition(self.num_partitions)\n        res = parts.mapPartitions(process_rows)\n        return res.reduceByKey(operator.add)\n        #return parts\n\n    def count_word(self, rows):\n        words = [word for user, word in list(rows)]\n        yield len(list(set(words)))\n\n    def process_rows(self, rows):\n        ngram_hist = defaultdict(float)\n        prev_user = None\n        user = None\n        token_buffer = []\n        while True:\n            try:\n                line = next(rows)\n            except StopIteration:\n                line = None\n            if line is not None:\n                user, token = line\n                if prev_user is None:\n                    prev_user = user\n            if line is not None and user == prev_user:\n                token_buffer.append(token)\n            else:  # user or stream is finished\n                if line is None:\n                    print(\"Final budget distribute with {0} left in buffer\".format(len(token_buffer)))\n                new_token_buffer = []\n                selected_ngrams = token_buffer\n                token_buffer = new_token_buffer\n                prev_user = user\n                gap_dict = {}\n\n                ngl = list(selected_ngrams)\n                for w in ngl:\n                    if ngram_hist[w] < self.Gamma:\n                        gap_dict[w] = self.Gamma - ngram_hist[w]\n                # sort rho dict\n                sorted_gap_dict = sorted(gap_dict.items(), key=operator.itemgetter(1))\n\n                sorted_gap_keys = [k for k, v in sorted_gap_dict]\n\n                budget = copy.copy(self.Delta)\n                total_tokens = len(sorted_gap_keys)\n\n                for i, w in enumerate(sorted_gap_keys):\n                    cost = gap_dict[w]*(total_tokens-i)\n                    if cost < budget:\n                        for j in range(i, total_tokens):\n                            add_gram = sorted_gap_keys[j]\n       
                     ngram_hist[add_gram] += gap_dict[w]\n                        # update remaining budget\n                        budget -= cost\n                        # update dictionary of values containing difference from gap\n                        for key in gap_dict: \n                            gap_dict[key] -= gap_dict[w] \n                    else:\n                        for j in range(i, total_tokens):\n                            add_gram = sorted_gap_keys[j]\n                            ngram_hist[add_gram] += budget/(total_tokens-i)\n                        break\n            if line is None:\n                break\n        print (\"Single partition histogram had {0} items\".format(len(ngram_hist.items())))\n        for k, v in ngram_hist.items():\n            yield (k, v)","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"epsilon = 3.0\ndelta = np.exp(-10)\nalpha = 5.0\ntokens_per_user = 500\nprune_tail_below = None\nnum_partitions = 1\n\npl = PolicyLaplace(epsilon, delta, alpha, tokens_per_user, prune_tail_below, num_partitions)","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"# prune the tail\npruned = pl.prune_tail(tokenized)\n\n# reservoir sample the input tokens\nsampled = pl.reservoir_sample(pruned, distinct).persist()","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"counted = pl.process_partitions(sampled)","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"good = counted.filter(lambda row: pl.exceeds_threshold(row[1])).map(lambda row: row[0])\n\nprint(\"Retrieved {0} words from {1}\".format(good.count(),counted.count()))\nprint(good.take(5))","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"","metadata":{},"execution_count":null,"outputs":[]}]}