{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "e39c21dc",
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import pickle\n",
    "import torch\n",
    "import pandas as pd\n",
    "from sklearn import metrics\n",
    "import tqdm\n",
    "from tqdm import tqdm\n",
    "import seaborn as sns\n",
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "from scipy import stats"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "c6eb5555",
   "metadata": {},
   "outputs": [],
   "source": [
    "# The ratings file is read relative to the UsageSimilarity markup directory.\n",
    "os.chdir('/makesense_dir_path/data/usim/Markup/UsageSimilarity/')\n",
    "data = pd.read_csv('usim2ratings.csv')\n",
    "\n",
    "# Sentence table; strip whitespace around its column names.\n",
    "data_all = pd.read_csv(\"/makesense_dir_path/data/usim_sents.csv\")\n",
    "data_all.columns = data_all.columns.str.strip()\n",
    "\n",
    "# Keep only the rows whose user_id is 'avg' (presumably the averaged\n",
    "# annotator rating - TODO confirm) and renumber them from 0.\n",
    "data = data.loc[data.user_id == 'avg'].reset_index(drop=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 66,
   "id": "2030a21e",
   "metadata": {},
   "outputs": [],
   "source": [
    "def _first_embedding_by_id(ids, embeddings):\n",
    "    \"\"\"Map each id to the first embedding paired with it.\n",
    "\n",
    "    Keeping the first occurrence gives the same answer as a linear\n",
    "    'first match' scan, while every later lookup is a dict access.\n",
    "    \"\"\"\n",
    "    lookup = {}\n",
    "    for key, vector in zip(ids, embeddings):\n",
    "        lookup.setdefault(key, vector)\n",
    "    return lookup\n",
    "\n",
    "\n",
    "def _pair_similarity(by_id, id1, id2):\n",
    "    \"\"\"Cosine similarity between the embeddings stored for id1 and id2.\"\"\"\n",
    "    first = by_id[id1].reshape(1, -1)\n",
    "    second = by_id[id2].reshape(1, -1)\n",
    "    return metrics.pairwise.cosine_similarity(first, second)[0][0]\n",
    "\n",
    "\n",
    "def annot_corr(l, data, data_all):\n",
    "    \"\"\"Cosine similarity of every rated pair in ``data`` at layer ``l``.\n",
    "\n",
    "    Loads ``layer_{l}.pt`` from the ``original`` and ``laser`` embedding\n",
    "    directories, pairs the i-th loaded embedding with the i-th value of\n",
    "    ``data_all.lexsub_id1`` and, for each row of ``data``, compares the\n",
    "    embeddings found for ``lexsub_id1`` and ``lexsub_id2``.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    l : int\n",
    "        Layer index used in the embedding file names.\n",
    "    data : pandas.DataFrame\n",
    "        One row per pair, with ``lexsub_id1`` and ``lexsub_id2`` columns.\n",
    "    data_all : pandas.DataFrame\n",
    "        Provides the ``lexsub_id1`` column; assumed to be row-aligned with\n",
    "        the loaded embeddings (TODO confirm - zip() silently truncates to\n",
    "        the shorter of the two).\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    tuple of (list, list)\n",
    "        Similarities from the original embeddings and from the laser\n",
    "        embeddings, both in the row order of ``data``.\n",
    "\n",
    "    Raises\n",
    "    ------\n",
    "    KeyError\n",
    "        If an id in ``data`` has no embedding in the loaded file.\n",
    "    \"\"\"\n",
    "    original = torch.load(f'/makesense_dir_path/data/usim/embeddings/original/layer_{l}.pt')\n",
    "    approx = torch.load(f'/makesense_dir_path/data/usim/embeddings/laser/layer_{l}.pt')\n",
    "\n",
    "    # Index the embeddings once per layer instead of rescanning the whole\n",
    "    # list four times for every row of data.\n",
    "    original_by_id = _first_embedding_by_id(data_all.lexsub_id1, original)\n",
    "    approx_by_id = _first_embedding_by_id(data_all.lexsub_id1, approx)\n",
    "\n",
    "    o_cosine_ = []\n",
    "    a_cosine_ = []\n",
    "    id_pairs = zip(data.lexsub_id1, data.lexsub_id2)\n",
    "    # zip() has no length, so tell tqdm the total to keep the progress bar.\n",
    "    for id1, id2 in tqdm(id_pairs, total=len(data)):\n",
    "        o_cosine_.append(_pair_similarity(original_by_id, id1, id2))\n",
    "        a_cosine_.append(_pair_similarity(approx_by_id, id1, id2))\n",
    "\n",
    "    return o_cosine_, a_cosine_"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 68,
   "id": "df5f3918",
   "metadata": {},
   "outputs": [],
   "source": [
    "def add_corr_(data, l, annot_corr):\n",
    "    \"\"\"Store one layer's similarity lists on ``data`` as float columns.\n",
    "\n",
    "    ``annot_corr[0]`` is written to column ``'ol<l>'`` and ``annot_corr[1]``\n",
    "    to column ``'al<l>'``. ``data`` is modified in place and also returned.\n",
    "    \"\"\"\n",
    "    for prefix, position in (('ol', 0), ('al', 1)):\n",
    "        column = prefix + str(l)\n",
    "        data[column] = annot_corr[position]\n",
    "        data[column] = data[column].astype(float)\n",
    "\n",
    "    return data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 69,
   "id": "afec15a1",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 1142/1142 [01:45<00:00, 10.79it/s]\n",
      "100%|██████████| 1142/1142 [01:54<00:00, 10.01it/s]\n",
      "100%|██████████| 1142/1142 [01:53<00:00, 10.05it/s]\n",
      "100%|██████████| 1142/1142 [01:56<00:00,  9.80it/s]\n",
      "100%|██████████| 1142/1142 [02:00<00:00,  9.51it/s]\n",
      "100%|██████████| 1142/1142 [01:58<00:00,  9.61it/s]\n",
      "100%|██████████| 1142/1142 [02:00<00:00,  9.45it/s]\n",
      "100%|██████████| 1142/1142 [02:05<00:00,  9.08it/s]\n",
      "100%|██████████| 1142/1142 [02:04<00:00,  9.18it/s]\n",
      "100%|██████████| 1142/1142 [02:07<00:00,  8.97it/s]\n",
      "100%|██████████| 1142/1142 [02:09<00:00,  8.80it/s]\n",
      "100%|██████████| 1142/1142 [02:11<00:00,  8.68it/s]\n",
      "100%|██████████| 1142/1142 [02:13<00:00,  8.55it/s]\n"
     ]
    }
   ],
   "source": [
    "# One pass per embedding file, layer_0.pt through layer_12.pt.\n",
    "for l in range(13):\n",
    "    similarities = annot_corr(l, data, data_all)\n",
    "    data = add_corr_(data, l, similarities)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 71,
   "id": "5a095ab3",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Save the ratings together with the per-layer similarity columns.\n",
    "data.to_csv(path_or_buf='/makesense_dir_path/data/annot_sim.csv', index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a272ed34",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Spearman rank correlation between the 'judgment' column and each layer's\n",
    "# similarity column: 'ol<l>' (original embeddings) and 'al<l>' (laser embeddings).\n",
    "records = []\n",
    "\n",
    "for l in range(13):\n",
    "    # Run each test once and read both the coefficient and the p-value from\n",
    "    # the same result instead of recomputing it per field.\n",
    "    o_result = stats.spearmanr(data['judgment'], data['ol'+str(l)])\n",
    "    a_result = stats.spearmanr(data['judgment'], data['al'+str(l)])\n",
    "    records.append((l, o_result.correlation, o_result.pvalue, a_result.correlation, a_result.pvalue))\n",
    "\n",
    "corr_df = pd.DataFrame(records, columns = ['layer', 'corr_o', 'p_o', 'corr_a', 'p_a'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "65d4ced9",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Save the per-layer correlation table.\n",
    "corr_df.to_csv(path_or_buf='/makesense_dir_path/analysis/usim_analysis/corr.csv', index=False)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
