{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 53,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import os\n",
    "import pickle\n",
    "from scipy.sparse import load_npz\n",
    "from sklearn.linear_model import LogisticRegression, Ridge\n",
    "from importlib import reload\n",
    "import model.fixed_pref_influence_model as im\n",
    "import model.multi_cause_influence as causal\n",
    "import model.pif_regression as pif\n",
    "import model.pmf as pmf\n",
    "import model.network_model as nm\n",
    "from scipy.stats import gamma, poisson, bernoulli\n",
    "from sklearn.metrics import mean_squared_error as mse\n",
    "\n",
    "# Folder holding the Pokec regional-subset inputs loaded later in this notebook (relative path).\n",
    "datadir = os.path.join('..', 'dat', 'pokec', 'regional_subset')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "def snowball_sample(A, size=2000):\n",
    "    sampled_users = set()\n",
    "    users = np.arange(A.shape[0])\n",
    "    np.random.shuffle(users)\n",
    "    u_iter = 0\n",
    "    while len(sampled_users) < size:\n",
    "        user = users[u_iter]\n",
    "        friends = np.nonzero(A[u_iter, :])[0]\n",
    "        sampled_users.add(user)\n",
    "        sampled_users |= set(list(friends))\n",
    "        u_iter+=1\n",
    "    return np.array(list(sampled_users))\n",
    "        "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_set_overlap(Beta_p, Beta, k=50):\n",
    "    top = np.argsort(Beta)[-k:]\n",
    "    top_p = np.argsort(Beta_p)[-k:]\n",
    "    return np.intersect1d(top, top_p).shape[0]/np.union1d(top, top_p).shape[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "def make_user_item_embeddings(one_hot_region_encoding, num_regions=3, confounding_strength=2., item_confounding=2., \n",
    "                              M=1000, gamma_shp=2., gamma_scale=0.1):\n",
    "    user_region_shp = (num_regions*gamma_shp*confounding_strength)/(confounding_strength + (num_regions-1))\n",
    "    Z = one_hot_region_encoding*gamma.rvs(user_region_shp, scale=gamma_scale, size=(users.shape[0], num_regions))\n",
    "    Z += (1-one_hot_region_encoding)*gamma.rvs(user_region_shp/confounding_strength, \n",
    "                                               scale=gamma_scale, size=(users.shape[0], num_regions))\n",
    "    \n",
    "    item_region_shp = (num_regions*gamma_shp*item_confounding)/(item_confounding + (num_regions-1))\n",
    "    item_encoding = np.zeros((M, num_regions))\n",
    "    item_encoding[np.arange(M),np.random.randint(0,num_regions,size=M)] = 1\n",
    "    Gamma = item_encoding*gamma.rvs(item_region_shp, scale=gamma_scale, size=(M, num_regions))\n",
    "    Gamma += (1-item_encoding)*gamma.rvs(item_region_shp/item_confounding, scale=gamma_scale, size=(M, num_regions))\n",
    "    \n",
    "    return Z, Gamma"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {},
   "outputs": [],
   "source": [
    "def make_embeddings(K=3, M=500, gamma_shp=2., gamma_scale=0.1):  \n",
    "    E = gamma.rvs(gamma_shp, scale=gamma_scale, size=(M, K))\n",
    "    return E"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "((1273994, 2), 78982)"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Load the edge list; the context manager closes the .npz archive once the array has been read.\n",
    "with np.load(os.path.join(datadir, 'pokec_links.npz')) as arr:\n",
    "    links = arr['edge_list']\n",
    "# Displayed: shape of the edge array and the number of distinct user indices it mentions.\n",
    "links.shape, np.unique(links).shape[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "# NOTE(review): pickle.load can execute arbitrary code; only run this on a trusted profiles.pkl.\n",
    "# `profiles` is used below as a pandas DataFrame whose index supplies the user ids — TODO confirm\n",
    "# against whatever script wrote the pickle.\n",
    "with open(os.path.join(datadir, 'profiles.pkl'), 'rb') as f:\n",
    "    profiles = pickle.load(f)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Positional index -> user id lookup used by the cells below.\n",
    "uids = profiles.index.values\n",
    "\n",
    "# Derived columns: age bin (np.digitize over bin edges 0, 10, ..., 90), integer region code,\n",
    "# and registration year.\n",
    "age_bins = np.arange(0, 100, step=10)\n",
    "profiles['age_binned'] = np.digitize(profiles.age.values, age_bins)\n",
    "profiles['region_categorical'] = pd.Categorical(profiles.region).codes\n",
    "profiles['reg_year'] = profiles.registration.dt.year\n",
    "\n",
    "# Re-label each region code by its rank among the distinct codes that are present.\n",
    "distinct_codes = np.unique(profiles['region_categorical'])\n",
    "code = dict((raw, rank) for rank, raw in enumerate(distinct_codes))\n",
    "profiles['region_categorical'] = profiles['region_categorical'].apply(lambda raw: code[raw])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "covariate = 'age_binned'\n",
    "# Both columns of `links` hold positions into `uids`; column 0 gives the user whose covariate\n",
    "# is the label and column 1 the linked user whose covariate is the feature.\n",
    "label_users, feature_users = uids[links[:, 0]], uids[links[:, 1]]\n",
    "covariate_by_user = profiles[covariate]\n",
    "labels = covariate_by_user.loc[label_users].values\n",
    "features = covariate_by_user.loc[feature_users].values"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/anaconda3/lib/python3.6/site-packages/sklearn/linear_model/logistic.py:758: ConvergenceWarning: lbfgs failed to converge. Increase the number of iterations.\n",
      "  \"of iterations.\", ConvergenceWarning)\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "0.3793848322676559"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# scikit-learn expects a 2-D design matrix. reshape(-1, 1) yields the same single-column array\n",
    "# as adding a new axis, and leaves `features` unchanged if this cell is run again.\n",
    "features = features.reshape(-1, 1)\n",
    "model = LogisticRegression(solver='lbfgs', multi_class='multinomial')\n",
    "model.fit(features, labels)\n",
    "# Displayed: the model's score on the same data it was fitted to.\n",
    "model.score(features, labels)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "from scipy.sparse import csr_matrix\n",
    "\n",
    "# Assemble the adjacency matrix sparsely with a 1.0 for every row of `links`, then densify it.\n",
    "row_inds, col_inds = links[:, 0], links[:, 1]\n",
    "data = np.ones(len(row_inds))\n",
    "A = csr_matrix((data, (row_inds, col_inds))).toarray()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(2026, 2026)"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "users = snowball_sample(A, size=2000)\n",
    "# Restrict the adjacency matrix to the sampled users on both axes.\n",
    "# NOTE(review): this overwrites `A`, so re-running the cell samples from the already-reduced matrix.\n",
    "A = A[np.ix_(users, users)]\n",
    "A.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Region code of every sampled user, in the order of `users`.\n",
    "regions = profiles.loc[uids[users], 'region_categorical'].values\n",
    "region_codes = np.unique(profiles['region_categorical'])\n",
    "num_regions = region_codes.shape[0]\n",
    "\n",
    "# One row per sampled user with a single 1 in the column of that user's region code.\n",
    "u_idx = np.arange(len(users))\n",
    "one_hot_region_encoding = np.zeros((len(users), num_regions))\n",
    "one_hot_region_encoding[u_idx, regions] = 1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(2026, 10)"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Age bin of every sampled user, in the order of `users`.\n",
    "age_binned = profiles.loc[uids[users], 'age_binned'].values\n",
    "ages = np.unique(profiles['age_binned'])\n",
    "num_bins = ages.shape[0]\n",
    "\n",
    "# One row per sampled user; bin b is stored in column b-1.\n",
    "# NOTE(review): the width is the number of distinct bins present in `profiles`, so this assumes\n",
    "# the bins in use are exactly 1..num_bins — confirm.\n",
    "u_idx = np.arange(len(users))\n",
    "one_hot_age_encoding = np.zeros((len(users), num_bins))\n",
    "one_hot_age_encoding[u_idx, age_binned-1] = 1\n",
    "one_hot_age_encoding.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.12187759131293188"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Simulate region-tied user (Z) and item (Gamma) factors, then Poisson counts with rate Z Gamma^T.\n",
    "Z, Gamma = make_user_item_embeddings(\n",
    "    one_hot_region_encoding,\n",
    "    confounding_strength=10.,\n",
    "    item_confounding=10.,\n",
    "    M=1000,\n",
    ")\n",
    "N = len(users)\n",
    "Y_past = poisson.rvs(Z @ Gamma.T)\n",
    "Y_past.mean()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.16176061204343534"
      ]
     },
     "execution_count": 41,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "N = users.shape[0]\n",
    "# Success probability of the per-cell Bernoulli switch drawn below.\n",
    "prob=0.5\n",
    "# A second set of item/user factors drawn by make_embeddings, which does not look at regions.\n",
    "random_item_embeddings = make_embeddings(K=5, M=1000)\n",
    "random_user_embeddings = make_embeddings(K=5, M=N)\n",
    "# One Bernoulli(prob) switch per (user, item) cell; 1000 is the item count used for Gamma above.\n",
    "mixture_assignments = bernoulli.rvs(prob, size=(N,1000))\n",
    "region_pref = Z.dot(Gamma.T)\n",
    "random_pref = random_user_embeddings.dot(random_item_embeddings.T)\n",
    "# Each cell's Poisson rate is the region-based rate where the switch is 1, else the random rate.\n",
    "rates = mixture_assignments*region_pref + (1-mixture_assignments)*random_pref\n",
    "# Rebinds `Y_past`, replacing whatever an earlier cell stored under that name.\n",
    "Y_past = poisson.rvs(rates)\n",
    "Y_past.mean()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "((2024, 1000), 0.0978508660728814, (110,), (1000, 8))"
      ]
     },
     "execution_count": 23,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# NOTE(review): `make_item_embeddings` is not defined or imported in the visible notebook, so this\n",
    "# cell raises NameError on a fresh kernel unless it is provided elsewhere — confirm or remove.\n",
    "# NOTE(review): the hard-coded 800/200 row counts only line up if `Gamma` has 800 rows on entry;\n",
    "# the cell that builds `Gamma` currently passes M=1000 — verify before re-running.\n",
    "# The cell rebinds `Gamma`, `Z` and `Y_past`, so running it twice stacks the extra blocks twice.\n",
    "k=5\n",
    "user_prefs = gamma.rvs(0.1, scale=0.1, size=(N,k))\n",
    "less_pop_items = make_item_embeddings(M=200, K=k, gamma_mean=0.03)\n",
    "\n",
    "# Note: Y_other is the raw dot product of the factors; it is not passed through poisson.rvs.\n",
    "Y_other = user_prefs.dot(less_pop_items.T)\n",
    "per_user = Y_other.sum(axis=1)\n",
    "# Users whose Y_other row sums to more than 1 get influence 1; everyone else gets 0.\n",
    "inf_users = np.arange(N)[per_user > 1]\n",
    "influence = np.zeros(N)\n",
    "influence[inf_users] = 1\n",
    "\n",
    "# Pad so the existing items get zeros on the k new dimensions and the new items get zeros on\n",
    "# the region dimensions, then append the new user factors as extra columns of Z.\n",
    "Gamma = np.vstack([Gamma, np.zeros((200, num_regions))])\n",
    "other_dims = np.vstack([np.zeros((800, k)), less_pop_items])\n",
    "Gamma = np.hstack([Gamma, other_dims])\n",
    "Z = np.hstack([Z, user_prefs])\n",
    "\n",
    "Y_past = np.hstack([Y_past, Y_other])\n",
    "Y_past.shape, Y_past.mean(), inf_users.shape, Gamma.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Per-user influence: N independent gamma draws with shape 0.002 and scale 10 (mean shape*scale = 0.02).\n",
    "influence = gamma.rvs(0.002, scale=10., size=N)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(0.2058889437314906, 0.04377132538988631)"
      ]
     },
     "execution_count": 44,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# `influence` is 1-D here, so `influence*A` broadcasts over the last axis and scales column j of A\n",
    "# by influence[j]; the product with Y_past then sums A[i, j] * influence[j] * Y_past[j, item] over j.\n",
    "rate_influence = (influence*A).dot(Y_past)\n",
    "# The Bernoulli(prob) switches are redrawn here rather than reusing an earlier draw.\n",
    "mixture_assignments = bernoulli.rvs(prob, size=(N,1000))\n",
    "rate_pref = mixture_assignments*region_pref + (1-mixture_assignments)*random_pref\n",
    "# Outcome counts: Poisson with the preference rate plus the influence rate.\n",
    "Y = poisson.rvs(rate_pref + rate_influence)\n",
    "Y.mean(), rate_influence.mean()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.30885348316459993"
      ]
     },
     "execution_count": 34,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Simulated intervention on the single most influential user: compare Poisson outcomes of the users\n",
    "# linked to that user when the user's row of past counts is forced to all ones vs. all zeros.\n",
    "# NOTE(review): this cell rebinds `influence`, `rate_pref`, `prob` and `k`, which other cells also\n",
    "# set — results depend on the order in which cells were executed.\n",
    "influence = gamma.rvs(0.01, scale=10., size=N)\n",
    "rate_pref = Z.dot(Gamma.T)\n",
    "prob = 0.7\n",
    "inf_by_adj = (influence*A)\n",
    "z = bernoulli.rvs(prob, size=rate_pref.shape)\n",
    "\n",
    "# i: user with the largest drawn influence; k: rows of A with a nonzero entry in column i.\n",
    "# np.nonzero returns a tuple, so `k` is a 1-tuple of index arrays; indexing with `[k,:]` below\n",
    "# presumably adds a leading axis of length 1 — TODO confirm this is intended.\n",
    "i = influence.argmax()\n",
    "k = np.nonzero(A[:,i])\n",
    "\n",
    "# Only one replicate is stored (array of length 1, loop over range(1)).\n",
    "treatment_effect = np.zeros(1)\n",
    "treatment_mask = Y_past.copy()\n",
    "untreated_mask = Y_past.copy()\n",
    "treatment_mask[i,:] = 1\n",
    "untreated_mask[i,:] = 0\n",
    "# Where z is 1 the rate is the preference rate; where z is 0 it is the influence-weighted sum of the\n",
    "# (modified) past counts. The two rates differ only through row i of the mask.\n",
    "treated_rate = z[k,:]*rate_pref[k,:] + (1-z[k,:])*((inf_by_adj[k,:]).dot(treatment_mask))\n",
    "untreated_rate = z[k,:]*rate_pref[k,:] + (1-z[k,:])*((inf_by_adj[k,:]).dot(untreated_mask))\n",
    "for j in range(1):\n",
    "    Y_treated = poisson.rvs(treated_rate)\n",
    "    Y_untreated = poisson.rvs(untreated_rate)\n",
    "    treatment_effect[j] = Y_treated.mean() - Y_untreated.mean()\n",
    "# Displayed: mean outcome difference divided by the influence of user i.\n",
    "treatment_effect.mean()/influence[i]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 47,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(0.10182549643260445, 0.0)"
      ]
     },
     "execution_count": 47,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Displayed: mean of the current `influence` vector and mean of the current `treatment_effect` array.\n",
    "influence.mean(), treatment_effect.mean()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 58,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "((1988,), (16,))"
      ]
     },
     "execution_count": 58,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "K = 20\n",
    "# Boolean masks over users: influence of at least 0.1, and influence of at most 0.001.\n",
    "sig_influential = np.greater_equal(influence, 0.1)\n",
    "least_influential = np.less_equal(influence, 0.001)\n",
    "influence[least_influential].shape, influence[sig_influential].shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 47,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\tAfter ITERATION: 1\tObjective: -193733.93\tOld objective: -191095.27\tImprovement: -0.01381\n",
      "\tAfter ITERATION: 1\tObjective: -756954.48\tOld objective: -798820.61\tImprovement: 0.05241"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/Users/dhanyasridhar/Documents/social-fake-news/src/model/network_model.py:160: RuntimeWarning: invalid value encountered in double_scalars\n",
      "  improvement = (bound - old_bd) / abs(old_bd)\n",
      "/Users/dhanyasridhar/Documents/social-fake-news/src/model/pmf.py:174: RuntimeWarning: invalid value encountered in double_scalars\n",
      "  improvement = (bound - old_bd) / abs(old_bd)\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\tAfter ITERATION: 32\tObjective: -669989.56\tOld objective: -670316.71\tImprovement: 0.00049\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "((2026, 20), (1000, 20))"
      ]
     },
     "execution_count": 47,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Fit the project's NetworkPoissonMF to the adjacency matrix and keep its `Et` attribute as the\n",
    "# fitted per-user factors (presumably a posterior expectation — confirm in model/network_model.py).\n",
    "network_model = nm.NetworkPoissonMF(n_components=K, verbose=True)\n",
    "network_model.fit(A)\n",
    "Z_fitted = network_model.Et\n",
    "# Fit PoissonMF to the past counts and keep the transpose of its `Eb` attribute as item factors.\n",
    "poisson_mf = pmf.PoissonMF(n_components=K, verbose=True)\n",
    "poisson_mf.fit(Y_past)\n",
    "Gamma_fitted= poisson_mf.Eb.T\n",
    "Z_fitted.shape, Gamma_fitted.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Bound: -1907224.8376669937\n",
      "Bound: -1077207.0255316042\n",
      "Bound: -1035159.5921319266\n",
      "Bound: -988273.0781479427\n",
      "Bound: -955053.1571319518\n",
      "Bound: -943615.2834215881\n",
      "Bound: -941179.1784705612\n",
      "Bound: -940157.1302874222\n",
      "Bound: -939471.9344494201\n",
      "-939471.9344494201 -939005.6594632454\n"
     ]
    }
   ],
   "source": [
    "# Fit the project's CausalInfluenceModel to the outcomes Y, passing the adjacency matrix, the fitted\n",
    "# user/item factors and the past counts (argument meanings live in model/multi_cause_influence.py).\n",
    "multi_cause_model = causal.CausalInfluenceModel(n_components=K, verbose=True)\n",
    "multi_cause_model.fit(Y, A, Z_fitted, Gamma_fitted, Y_past)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 60,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(1.373835317237568, 0.04506795697828349)"
      ]
     },
     "execution_count": 60,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Influence estimates read from the fitted model's `E_beta` attribute.\n",
    "beta1 = multi_cause_model.E_beta\n",
    "# NOTE(review): `random_beta` is not used anywhere in the visible notebook — presumably a leftover baseline.\n",
    "random_beta = gamma.rvs(0.002, scale=10., size=N)\n",
    "# Displayed: MSE between simulated influence and the estimates, on the high- and low-influence masks.\n",
    "mse(influence[sig_influential], beta1[sig_influential]), mse(influence[least_influential],beta1[least_influential])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 61,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Num. nonzero obs: 300503\n",
      "Computing element-wise product of A, Y past...\n",
      "Computing nonzero elements of AY...\n",
      "User embeddings set to 0.0..\n",
      "Bound: -2179363.4446093095\n",
      "User embeddings set to 0.0..\n",
      "Bound: -614878.8481912806\n",
      "User embeddings set to 0.0..\n",
      "Bound: -592979.4207089379\n",
      "User embeddings set to 0.0..\n",
      "Bound: -590485.9813653021\n",
      "User embeddings set to 0.0..\n",
      "Bound: -590008.1855313268\n",
      "User embeddings set to 0.0..\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "(0.014170019719143181, 0.05102746374879843)"
      ]
     },
     "execution_count": 61,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Number of rows in the all-zero item-factor array passed to the fit below.\n",
    "M=1000\n",
    "# Baseline: fit PoissonInfluenceModel with all-zero user and item factor arrays in place of fitted ones.\n",
    "naive_model = im.PoissonInfluenceModel(n_components=K, verbose=True)\n",
    "naive_model.fit(Y, A, np.zeros((N,K)), np.zeros((M,K)), Y_past)\n",
    "beta_naive = naive_model.E_beta\n",
    "# Displayed: the same two MSE values as for the multi-cause model, for comparison.\n",
    "mse(influence[sig_influential], beta_naive[sig_influential]), mse(influence[least_influential],beta_naive[least_influential])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
