{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "from utils import hash_func\n",
    "import numpy as np\n",
    "import csv\n",
    "\n",
    "def count_sketch(filepath, limit, r, b, eps):\n",
    "    '''\n",
    "    Description:\n",
    "        Runs count sketch algorithm on IP addresses.\n",
    "        Reads the integer in column 0 of every CSV row (the first row is\n",
    "        skipped as a header) and feeds it into r independent sketch rows\n",
    "        of b signed counters each. A running signed sum is also kept per\n",
    "        repetition and used to derive the reporting threshold:\n",
    "            thresh = eps * sqrt(median(signed_sum ** 2))\n",
    "        Every distinct value seen is then queried, and it is reported when\n",
    "        the median of its r per-row estimates is greater than thresh.\n",
    "    Inputs:\n",
    "        filepath: file to run algorithm on\n",
    "        limit: maximum number of valid addresses to process; rows that are\n",
    "               skipped as invalid do not count. None reads the whole file,\n",
    "               a value <= 0 processes nothing.\n",
    "        r: number of algorithm repetitions\n",
    "        b: number of buckets (hyperparam)\n",
    "        eps: desired accuracy (hyperparam)\n",
    "    Outputs:\n",
    "        returns list of heavy hitters for filepath\n",
    "    Notes:\n",
    "        hash_func is imported from utils and is not visible here; this code\n",
    "        assumes hash_func(seed, x, m) returns an integer in [0, m) -- TODO\n",
    "        confirm against utils.\n",
    "    '''\n",
    "\n",
    "    buckets_list = np.zeros((r, b))\n",
    "    approx_norms = np.zeros(r)\n",
    "    unique_ips = set()\n",
    "    processed = 0  # addresses actually fed into the sketch\n",
    "\n",
    "    # The csv module asks for files to be opened with newline=''.\n",
    "    with open(filepath, newline='') as csvfile:\n",
    "        stream = csv.reader(csvfile, delimiter=\",\")\n",
    "        next(stream, None)  # get rid of the header; tolerate an empty file\n",
    "\n",
    "        for row in stream:\n",
    "            # Count processed addresses instead of comparing reader.line_num,\n",
    "            # which also counts the header and every skipped row.\n",
    "            if limit is not None and processed >= limit:\n",
    "                break\n",
    "            try:\n",
    "                s_i = int(row[0])\n",
    "            except (IndexError, ValueError):\n",
    "                # blank row, or row[0] is not a valid integer\n",
    "                continue\n",
    "            processed += 1\n",
    "            unique_ips.add(s_i)\n",
    "            # process the stream\n",
    "            for i in range(r):\n",
    "                # seeds i+100 pick the sign hash, seeds i pick the bucket hash;\n",
    "                # 2*h - 1 turns a 0/1 hash value into -1/+1\n",
    "                v_i = 2*hash_func(i+100, s_i, 2) - 1\n",
    "                bucket = hash_func(i, s_i, b)\n",
    "                buckets_list[i][bucket] += v_i\n",
    "                approx_norms[i] += v_i\n",
    "\n",
    "    # determine the threshold (the file is no longer needed from here on)\n",
    "    thresh = eps * np.sqrt(np.median(approx_norms**2))\n",
    "\n",
    "    # determine the heavy hitters\n",
    "    heavy_hitters = []\n",
    "    for ip in unique_ips:\n",
    "        f_ip = []\n",
    "        for i in range(r):\n",
    "            sign = 2*hash_func(i+100, ip, 2) - 1\n",
    "            bucket = hash_func(i, ip, b)\n",
    "            # multiplying by the sign again undoes the sign applied on update\n",
    "            f_ip.append(buckets_list[i][bucket]*sign)\n",
    "        est_freq = np.median(f_ip)\n",
    "        if est_freq > thresh:\n",
    "            heavy_hitters.append(ip)\n",
    "    return heavy_hitters"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## AOL Dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Sketch settings: 5 repetitions of 300 buckets each; eps scales the\n",
    "# reporting threshold and limit caps how much of the stream is read.\n",
    "hh_cs = count_sketch(\n",
    "    filepath=\"aol-processed.csv\",\n",
    "    limit=10_000,\n",
    "    r=5,\n",
    "    b=300,\n",
    "    eps=0.1,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[69026,\n",
       " 103069,\n",
       " 116727,\n",
       " 122466,\n",
       " 251245,\n",
       " 251673,\n",
       " 268550,\n",
       " 289525,\n",
       " 339461,\n",
       " 411592,\n",
       " 444479,\n",
       " 483349,\n",
       " 545927,\n",
       " 558482,\n",
       " 585905,\n",
       " 668519,\n",
       " 693392,\n",
       " 740429,\n",
       " 768069,\n",
       " 902148,\n",
       " 921041,\n",
       " 1041196,\n",
       " 1042069,\n",
       " 1061480,\n",
       " 1082283,\n",
       " 1105372,\n",
       " 1235142,\n",
       " 1284480,\n",
       " 1291880,\n",
       " 1375429,\n",
       " 1466713,\n",
       " 1524559,\n",
       " 1636218,\n",
       " 1719567,\n",
       " 1772668,\n",
       " 1814348,\n",
       " 1836977,\n",
       " 1926178,\n",
       " 1995656,\n",
       " 2074803,\n",
       " 2323675,\n",
       " 2337115,\n",
       " 2609996,\n",
       " 2700537,\n",
       " 2747414,\n",
       " 2751788,\n",
       " 2765859,\n",
       " 2782943,\n",
       " 2820656,\n",
       " 2907392,\n",
       " 3147979,\n",
       " 3163738,\n",
       " 3260837,\n",
       " 3318459,\n",
       " 3406329,\n",
       " 3547648]"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Put the reported items in ascending order so the displayed list is easy to scan.\n",
    "hh_cs = list(hh_cs)\n",
    "hh_cs.sort()\n",
    "hh_cs"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
