{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import pystan\n",
    "from scipy.stats import kendalltau\n",
    "import arviz as az\n",
    "import matplotlib.pyplot as plt"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df = pd.read_csv('Elicitation Formats/rank-rank/rank-rank_Geography.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Convert string representations of lists into actual lists\n",
    "import ast  \n",
    "df['options'] = df['options'].apply(ast.literal_eval)\n",
    "df['votes'] = df['votes'].apply(ast.literal_eval)\n",
    "df['predictions'] = df['predictions'].apply(ast.literal_eval)\n",
    "\n",
    "N = 10 # Total number of voters\n",
    "J = df['question'].nunique()  # Total number of questions\n",
    "K = len((df['options'].iloc[0]))  # Number of options per vote, assuming uniform across questions\n",
    "\n",
    "votes_data = np.zeros((N, J, K), dtype=int)\n",
    "predictions_data = np.zeros((N, J, K), dtype=int)\n",
    "kendall_tau_votes = np.zeros((N, J))\n",
    "kendall_tau_predictions = np.zeros((N, J))\n",
    "\n",
    "# Iterate over each question\n",
    "for j in range(J):\n",
    "    question_responses = df[df['question'] == j+1]  # Filter responses for the current question\n",
    "    \n",
    "    for i, response in question_responses.iterrows():\n",
    "        # Calculate Kendall tau distance for the current response\n",
    "        kt_vote, _ = kendalltau(response['votes'], response['options'])\n",
    "        kt_prediction, _ = kendalltau(response['predictions'], response['options'])\n",
    "        \n",
    "        # Find the index of the current voter based on the loop iteration\n",
    "        # This assumes voters are evenly distributed across questions\n",
    "        n = i % N\n",
    "        \n",
    "        # Assign calculated distances\n",
    "        kendall_tau_votes[n, j] = kt_vote\n",
    "        kendall_tau_predictions[n, j] = kt_prediction\n",
    "\n",
    "# Preparing the PyStan data dictionary\n",
    "stan_data = {\n",
    "    'N': N,\n",
    "    'K': K,\n",
    "    'J': J,\n",
    "    'kendall_tau_votes': kendall_tau_votes,\n",
    "    'kendall_tau_predictions': kendall_tau_predictions,\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "stan_model_code = \"\"\"\n",
    "data {\n",
    "  int<lower=1> N; // Number of voters\n",
    "  int<lower=1> J; // Number of questions\n",
    "  real kendall_tau_votes[N, J]; // Kendall tau distances for votes\n",
    "  real kendall_tau_predictions[N, J]; // Kendall tau distances for predictions\n",
    "}\n",
    "\n",
    "parameters {\n",
    "  real<lower=0> dispersion_vote_expert; // Dispersion parameter for expert votes\n",
    "  real<lower=0> dispersion_vote_nonexpert; // Dispersion parameter for non-expert votes\n",
    "  real<lower=0> dispersion_pred_expert; // Dispersion parameter for expert predictions\n",
    "  real<lower=0> dispersion_pred_nonexpert; // Dispersion parameter for non-expert predictions\n",
    "  real<lower=0, upper=1> prob_expert; // Probability of being an expert\n",
    "}\n",
    "\n",
    "model {\n",
    "  // Priors\n",
    "  dispersion_vote_expert ~ normal(0.15, 0.075); // More relaxed\n",
    "  dispersion_vote_nonexpert ~ normal(0.7, 0.3); // Adjusted for broader range\n",
    "  dispersion_pred_expert ~ normal(0.7, 0.3); // More relaxed\n",
    "  dispersion_pred_nonexpert ~ normal(0.7, 0.3); // Adjusted for broader range\n",
    "  prob_expert ~ beta(1, 2.5);\n",
    "\n",
    "  for (n in 1:N) {\n",
    "    // Mixing model for expert and non-expert behavior\n",
    "    target += log_mix(prob_expert,\n",
    "                      normal_lpdf(kendall_tau_votes[n] | 0, dispersion_vote_expert) + \n",
    "                      normal_lpdf(kendall_tau_predictions[n] | 0, dispersion_pred_expert),\n",
    "                      normal_lpdf(kendall_tau_votes[n] | 0, dispersion_vote_nonexpert) + \n",
    "                      normal_lpdf(kendall_tau_predictions[n] | 0, dispersion_pred_nonexpert));\n",
    "  }\n",
    "}\n",
    "\"\"\"\n",
    "def init_function():\n",
    "    return {\n",
    "        'dispersion_vote_expert': 0.1, \n",
    "        'dispersion_vote_nonexpert': 0.9, \n",
    "        'dispersion_pred_expert': 1.0, \n",
    "        'dispersion_pred_nonexpert': 1.0, \n",
    "        'prob_expert': 0.1\n",
    "    }\n",
    "\n",
    "\n",
    "\n",
    "sm = pystan.StanModel(model_code=stan_model_code)\n",
    "\n",
    "original_fit = sm.sampling(data=stan_data, iter=4000, warmup=1000, chains=4, init=init_function)\n",
    "az.plot_trace(original_fit, var_names=['dispersion_vote_expert', 'dispersion_pred_expert', 'dispersion_vote_nonexpert', 'dispersion_pred_nonexpert', 'prob_expert'])\n",
    "\n",
    "original_fit.summary()\n",
    "\n",
    "print(original_fit)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# %%\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "import pystan\n",
    "from scipy.stats import kendalltau\n",
    "import arviz as az\n",
    "import ast \n",
    "\n",
    "df = pd.read_csv('simulated_data.csv')\n",
    "\n",
    "def mallows_distance(ranking1, ranking2):\n",
    "    rank2_indices = {value: idx for idx, value in enumerate(ranking2)}\n",
    "    distance = sum(abs(i - rank2_indices[value]) for i, value in enumerate(ranking1))\n",
    "    return distance\n",
    "\n",
    "\n",
    "df['options'] = df['options'].apply(ast.literal_eval)\n",
    "df['votes'] = df['votes'].apply(ast.literal_eval)\n",
    "df['predictions'] = df['predictions'].apply(ast.literal_eval)\n",
    "\n",
    "N = 10  # Total number of voters\n",
    "J = df['question'].nunique()  # Total number of questions\n",
    "K = len((df['options'].iloc[0]))  # Number of options per vote, assuming uniform across questions\n",
    "\n",
    "# Preparing the data structure for PyStan\n",
    "votes_data = np.zeros((N, J, K), dtype=int)\n",
    "predictions_data = np.zeros((N, J, K), dtype=int)\n",
    "kendall_tau_votes = np.zeros((N, J))\n",
    "kendall_tau_predictions = np.zeros((N, J))\n",
    "\n",
    "for j in range(J):\n",
    "    question_responses = df[df['question'] == j+1]  # Filter responses for the current question\n",
    "    \n",
    "    # Iterate through each response for the current question\n",
    "    for i, response in question_responses.iterrows():\n",
    "        # Calculate Kendall tau distance for the current response\n",
    "        kt_vote, _ = kendalltau(response['votes'], response['options'])\n",
    "        kt_prediction, _ = kendalltau(response['predictions'], response['options'])\n",
    "        \n",
    "        # Find the index of the current voter based on the loop iteration\n",
    "        # This assumes voters are evenly distributed across questions\n",
    "        n = i % N\n",
    "        \n",
    "        # Assign calculated distances\n",
    "        kendall_tau_votes[n, j] = kt_vote\n",
    "        kendall_tau_predictions[n, j] = kt_prediction\n",
    "\n",
    "# Preparing the PyStan data dictionary\n",
    "stan_data = {\n",
    "    'N': N,\n",
    "    'K': K,\n",
    "    'J': J,\n",
    "    'kendall_tau_votes': kendall_tau_votes,\n",
    "    'kendall_tau_predictions': kendall_tau_predictions,\n",
    "}\n",
    "\n",
    "# %%\n",
    "stan_model_code = \"\"\"\n",
    "data {\n",
    "  int<lower=1> N; // Number of voters\n",
    "  int<lower=1> J; // Number of questions\n",
    "  real kendall_tau_votes[N, J]; // Kendall tau distances for votes\n",
    "  real kendall_tau_predictions[N, J]; // Kendall tau distances for predictions\n",
    "}\n",
    "\n",
    "parameters {\n",
    "  real<lower=0> dispersion_vote_expert; // Dispersion parameter for expert votes\n",
    "  real<lower=0> dispersion_vote_nonexpert; // Dispersion parameter for non-expert votes\n",
    "  real<lower=0> dispersion_pred_expert; // Dispersion parameter for expert predictions\n",
    "  real<lower=0> dispersion_pred_nonexpert; // Dispersion parameter for non-expert predictions\n",
    "  real<lower=0, upper=1> prob_expert; // Probability of being an expert\n",
    "}\n",
    "\n",
    "model {\n",
    "  // Priors\n",
    "  dispersion_vote_expert ~ normal(0.15, 0.075); // More relaxed\n",
    "  dispersion_vote_nonexpert ~ normal(0.7, 0.3); // Adjusted for broader range\n",
    "  dispersion_pred_expert ~ normal(0.7, 0.3); // More relaxed\n",
    "  dispersion_pred_nonexpert ~ normal(0.7, 0.3); // Adjusted for broader range\n",
    "  prob_expert ~ beta(1, 2.5);\n",
    "\n",
    "  for (n in 1:N) {\n",
    "    // Mixing model for expert and non-expert behavior\n",
    "    target += log_mix(prob_expert,\n",
    "                      normal_lpdf(kendall_tau_votes[n] | 0, dispersion_vote_expert) + \n",
    "                      normal_lpdf(kendall_tau_predictions[n] | 0, dispersion_pred_expert),\n",
    "                      normal_lpdf(kendall_tau_votes[n] | 0, dispersion_vote_nonexpert) + \n",
    "                      normal_lpdf(kendall_tau_predictions[n] | 0, dispersion_pred_nonexpert));\n",
    "  }\n",
    "}\n",
    "\"\"\"\n",
    "def init_function():\n",
    "    return {\n",
    "        'dispersion_vote_expert': 0.1, \n",
    "        'dispersion_vote_nonexpert': 0.9, \n",
    "        'dispersion_pred_expert': 1.0, \n",
    "        'dispersion_pred_nonexpert': 1.0, \n",
    "        'prob_expert': 0.1\n",
    "    }\n",
    "\n",
    "\n",
    "sm = pystan.StanModel(model_code=stan_model_code)\n",
    "\n",
    "fit = sm.sampling(data=stan_data, iter=4000, warmup=1000, chains=4, init=init_function)\n",
    "az.plot_trace(fit, var_names=['dispersion_vote_expert', 'dispersion_pred_expert', 'dispersion_vote_nonexpert', 'dispersion_pred_nonexpert', 'prob_expert'])\n",
    "\n",
    "fit.summary()\n",
    "\n",
    "print(fit)\n",
    "plt.tight_layout()\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sns\n",
    "import numpy as np\n",
    "import pystan\n",
    "\n",
    "original_params = original_fit.extract(permuted=True)\n",
    "params = fit.extract(permuted=True)\n",
    "\n",
    "# Define the parameter names and their custom titles\n",
    "param_names = [\n",
    "    'dispersion_vote_expert',\n",
    "    'dispersion_pred_expert',\n",
    "    'dispersion_vote_nonexpert',\n",
    "    'dispersion_pred_nonexpert',\n",
    "]\n",
    "\n",
    "custom_titles = {\n",
    "    'dispersion_vote_expert': 'Dispersion Parameter - Expert Votes',\n",
    "    'dispersion_pred_expert': 'Dispersion Parameter - Expert Predictions',\n",
    "    'dispersion_vote_nonexpert': 'Dispersion Parameter - Non-Expert Votes',\n",
    "    'dispersion_pred_nonexpert': 'Dispersion Parameter - Non-Expert Predictions'\n",
    "}\n",
    "\n",
    "# Plot settings\n",
    "sns.set(style=\"white\")  # No grid\n",
    "fig, axes = plt.subplots(1, 4, figsize=(20, 5))  # Subplots in a single row and wide figure size\n",
    "\n",
    "for i, ax in enumerate(axes):\n",
    "    param = param_names[i]\n",
    "    sns.kdeplot(original_params[param], color='blue', linestyle='-', ax=ax, bw_adjust=0.5, label='Original Data' if i == 0 else \"\")\n",
    "    sns.kdeplot(params[param], color='red', linestyle='--', ax=ax, bw_adjust=0.5, label='Synthetic Data' if i == 0 else \"\")\n",
    "    ax.set_xlim(0, 1)  # Set x-axis range from 0 to 1 for all plots\n",
    "    ax.set_title(custom_titles[param], fontsize=17)  # Set custom titles for each plot\n",
    "    ax.set_xlabel('')  # Remove individual x-labels\n",
    "    ax.set_ylabel('')  # Remove individual y-labels\n",
    "    ax.tick_params(axis='x', labelsize=16)  # Adjust font size for xticks\n",
    "    ax.tick_params(axis='y', labelsize=16)  # Adjust font size for yticks\n",
    "\n",
    "# Adding common X-axis and Y-axis labels\n",
    "fig.text(0.5, 0.04, 'Kendall-tau distance from ground truth', ha='center', va='center', fontsize=25)\n",
    "fig.text(0.04, 0.5, 'Density', ha='center', va='center', rotation='vertical', fontsize=25)\n",
    "\n",
    "# Create a common legend for the entire figure\n",
    "fig.legend(loc='upper center', ncol=2,  fontsize=12)\n",
    "plt.tight_layout(rect=[0.05, 0.05, 1, 0.95])  # Adjust layout to fit everything properly\n",
    "plt.show()\n"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.15"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
