{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## CountSketch (CS)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "from utils import hash_func\n",
    "import numpy as np\n",
    "import csv\n",
    "import ipaddress\n",
    "\n",
    "def count_sketch(filepath, limit, r, b, eps):\n",
    "    '''\n",
    "    Description:\n",
    "        Runs the count sketch algorithm on a stream of IP addresses and\n",
    "        returns every address whose median frequency estimate exceeds\n",
    "        eps * sqrt(median of the squared per-repetition signed sums).\n",
    "    Inputs:\n",
    "        filepath: CSV file with one header row; column 0 holds the address\n",
    "        limit: maximum number of valid addresses to process\n",
    "        r: number of algorithm repetitions (rows of the sketch)\n",
    "        b: number of buckets per repetition (hyperparam)\n",
    "        eps: desired accuracy; scales the heavy-hitter threshold (hyperparam)\n",
    "    Outputs:\n",
    "        returns list of heavy hitters for filepath, each address encoded\n",
    "        as an int\n",
    "    Notes:\n",
    "        Assumes hash_func(seed, x, m) returns an int in [0, m) - TODO\n",
    "        confirm against utils.hash_func.\n",
    "    '''\n",
    "\n",
    "    # Seeds 0..r-1 select buckets; the same seeds shifted by this offset\n",
    "    # select signs. The update and query phases must use the same offset.\n",
    "    sign_seed_offset = 100\n",
    "\n",
    "    heavy_hitters = []\n",
    "    buckets_list = np.zeros((r, b))\n",
    "    approx_norms = np.zeros(r)\n",
    "    unique_ips = set()\n",
    "    processed = 0\n",
    "\n",
    "    with open(filepath) as csvfile:\n",
    "        stream = csv.reader(csvfile, delimiter=\",\")\n",
    "        next(stream, None)  # get rid of the header; tolerate an empty file\n",
    "\n",
    "        for row in stream:\n",
    "            # Count parsed addresses instead of raw CSV lines, so the header\n",
    "            # and rejected rows do not eat into the limit and a limit of 0\n",
    "            # or 1 still stops the loop.\n",
    "            if processed >= limit:\n",
    "                break\n",
    "            try:\n",
    "                s_i = int(ipaddress.ip_address(row[0]))\n",
    "            except (ValueError, IndexError):\n",
    "                # blank row, or row[0] is not an IPv4 or IPv6 address\n",
    "                continue\n",
    "            processed += 1\n",
    "            unique_ips.add(s_i)\n",
    "            # process the stream\n",
    "            for i in range(r):\n",
    "                v_i = 2*hash_func(i + sign_seed_offset, s_i, 2) - 1\n",
    "                bucket = hash_func(i, s_i, b)\n",
    "                buckets_list[i, bucket] += v_i\n",
    "                # signed running sum; squared below to set the threshold\n",
    "                approx_norms[i] += v_i\n",
    "\n",
    "    # determine the threshold (the file is already closed at this point)\n",
    "    thresh = eps * np.sqrt(np.median(approx_norms**2))\n",
    "    # determine the heavy hitters\n",
    "    for ip in unique_ips:\n",
    "        f_ip = []\n",
    "        for i in range(r):\n",
    "            sign = 2*hash_func(i + sign_seed_offset, ip, 2) - 1\n",
    "            bucket = hash_func(i, ip, b)\n",
    "            f_ip.append(buckets_list[i, bucket]*sign)\n",
    "        # median across repetitions is the frequency estimate for this ip\n",
    "        est_freq = np.median(f_ip)\n",
    "        if est_freq > thresh:\n",
    "            heavy_hitters.append(ip)\n",
    "    return heavy_hitters"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## CAIDA Dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "# CountSketch on the CAIDA trace: limit of 1e5, 5 repetitions, 300 buckets\n",
    "hh_cs = count_sketch(\n",
    "    filepath=\"CAIDA_12mins_sender_IPs/125910_ip_timestamps.csv\",\n",
    "    limit=int(1e5),\n",
    "    r=5,\n",
    "    b=300,\n",
    "    eps=0.1,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[2401796176,\n",
       " 3573875943,\n",
       " 879396542,\n",
       " 1572803056,\n",
       " 3451786411,\n",
       " 1298962670,\n",
       " 607033987,\n",
       " 2497912303,\n",
       " 3339135037,\n",
       " 887074222,\n",
       " 1115991985,\n",
       " 2613392396,\n",
       " 1604957565,\n",
       " 1072021340,\n",
       " 2987213905,\n",
       " 3124349474,\n",
       " 3124349526,\n",
       " 517524860,\n",
       " 595152676,\n",
       " 3568914218,\n",
       " 2839797836,\n",
       " 1039166085,\n",
       " 1115974613,\n",
       " 3174560376,\n",
       " 3124327772,\n",
       " 3016127925]"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "hh_cs"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
