{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import itertools\n",
    "import os\n",
    "import json\n",
    "import copy\n",
    "import sys\n",
    "from collections import defaultdict\n",
    "import itertools\n",
    "import pandas as pd\n",
    "from tqdm.notebook import tqdm\n",
    "import scipy.stats as stats\n",
    "import matplotlib.pyplot as plt\n",
    "import numpy as np\n",
    "import pickle\n",
    "import random\n",
    "import seaborn as sns\n",
    "import scipy.stats as stats\n",
    "import pandas as pd\n",
    "#import seaborn as sns\n",
    "import seaborn as sns\n",
    "from matplotlib import rcParams\n",
    "import scipy\n",
    "from scipy import stats\n",
    "import seaborn as sns\n",
    "from scipy import stats\n",
    "import sys\n",
    "import seaborn as sns"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Each version of the dataset in the form of :\n",
    "\n",
    "$$M = \\begin{bmatrix} &  & M_{1} & M_{2} & \\cdots & M_{|M|} & H_{1} & H_{2} & \\cdots &H_{|H|} \\\\\n",
    "S_1 & u^{S_1}_1  &  &   &   &   &   &   &   &  \\\\\n",
    "S_1 & \\cdots  &  &   &   &   &   &   &   &  \\\\\n",
    "S_1 & u^{S_1}_{|U|}   &  &   &   &   &   &   &   &  \\\\\n",
    " \\cdots &     &  &   &   &   &   &   &   &  \\\\\n",
    "  \\cdots &    &  &   &   &   &   &   &   &  \\\\\n",
    "S_{|S|} & u^{S_{|S|}}_1  &  &   &   &   &   &   &   &  \\\\\n",
    "S_{|S|} & \\cdots  &  &   &   &   &   &   &   &  \\\\\n",
    "S_{|S|} & u^{S_{|S|}}_{|U|}   &  &   &   &   &   &   &   &  \\\\\n",
    "\\end{bmatrix}$$\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Explore Dataset and Check Correlation with Literature"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def compute_correlation(df,human,metric):\n",
    "    \"\"\"\n",
    "    df: considered matrix M\n",
    "    human: human annotation name to compute correlation with \n",
    "    metric: metric name to compute correlation with \n",
    "    \"\"\"\n",
    "    ##################\n",
    "    ## System Level ## \n",
    "    ##################\n",
    "    hum_metric = df[human].groupby('System').mean().values\n",
    "    sys_metric = df[metric].groupby('System').mean().values\n",
    "    final_dic = {'$S_\\\\rho$':stats.pearsonr(hum_metric, sys_metric)[0],\n",
    "                 '$S_p$':stats.spearmanr(hum_metric, sys_metric)[0],\n",
    "                 '$S_\\\\tau$':stats.kendalltau(hum_metric, sys_metric)[0],\n",
    "                '$T_\\\\rho$':[],'$T_p$':[],'$T_\\\\tau$':[]}\n",
    "    \n",
    "    ################\n",
    "    ## Text Level ## \n",
    "    ################\n",
    "    for (_, df_h),(_,df_m) in zip(df[human].groupby(level=0),df[metric].groupby(level=0)):\n",
    "        if stats.pearsonr(df_h.values, df_m.values)[1] < 0.05:\n",
    "                final_dic['$T_p$'].append(stats.pearsonr(df_h.values, df_m.values)[0])\n",
    "        if stats.spearmanr(df_h.values, df_m.values)[1] < 0.05:\n",
    "                final_dic['$T_\\\\rho$'].append(stats.spearmanr(df_h.values, df_m.values)[0])\n",
    "        if scipy.stats.kendalltau(df_h.values, df_m.values)[1] < 0.05:\n",
    "                final_dic['$T_\\\\tau$'].append(stats.kendalltau(df_h.values, df_m.values)[0])\n",
    "    final_dic['$T_p$'] = sum(final_dic['$T_p$'])/len(final_dic['$T_p$'])\n",
    "    final_dic['$T_\\\\rho$'] = sum(final_dic['$T_\\\\rho$'])/len(final_dic['$T_\\\\rho$'])\n",
    "    final_dic['$T_\\\\tau$'] = sum(final_dic['$T_\\\\tau$'])/len(final_dic['$T_\\\\tau$'])\n",
    "    \n",
    "    return final_dic"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Check TAC from https://arxiv.org/pdf/1909.02622.pdf\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "AVAILBLE_DATASETS = ['DIALOG_pc.csv','DIALOG_tc.csv','FLICKR.csv','MLQE.csv','REAL_SUM.csv',\n",
    "                     'SUM_EVAL.csv','TAC_08.csv','TAC_09.csv','TAC_11.csv']\n",
    "data_path = 'final_df'\n",
    "ds_to_load = 'DIALOG_pc.csv'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "count = 0\n",
    "for ds in AVAILBLE_DATASETS:\n",
    "    print('--------------------------------------')\n",
    "    print( 'Loading DS',ds)\n",
    "    considered_df = pd.read_csv(os.path.join(data_path,ds)).set_index(['System','Utterance'])\n",
    "    print(considered_df.shape)\n",
    "    count += considered_df.shape[0]*considered_df.shape[1]\n",
    "    is_metric = ['H:' in i for i in considered_df.columns]\n",
    "    human_metrics = []\n",
    "    for index,i in enumerate(considered_df.columns):\n",
    "        if is_metric[index]: \n",
    "            print('Human',i)\n",
    "            human_metrics.append(i)\n",
    "        else:\n",
    "            pass\n",
    "            print('Metric',i)\n",
    "print('Number of points',count)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Ranking Methods"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def scores_by_utterances(df, metric):\n",
    "    # get scores by utterances\n",
    "    # return array of size (nb of utterances, nb of systems), \n",
    "    # where row i contains scores of systems for utterance i.\n",
    "    # Note for the future: easier to use pd.IndexSlice instead\n",
    "    # of the loops.\n",
    "    \n",
    "    \n",
    "    utterances = []\n",
    "    scores = []\n",
    "    for sys, ut in df.index:\n",
    "        if ut not in utterances:\n",
    "            utterances.append(ut)\n",
    "            scores.append([])\n",
    "        n_ut = utterances.index(ut)\n",
    "        scores[n_ut].append(df.loc[(sys, ut)][metric])   \n",
    "    return np.array(scores)\n",
    "\n",
    "def rankings_by_utterances(df, metric):\n",
    "    # return array of size (nb of utterances, nb of systems),\n",
    "    # where row i corresponds to the ranking of systems according \n",
    "    # to their perfomances on utterance i.\n",
    "    # NB: the ranking is in decreasing order meaning that\n",
    "    # [40, 3, 2, ...] reads: S0 is ranked 40, S1 is ranked 3, etc\n",
    "    \n",
    "    # get scores\n",
    "    scores = scores_by_utterances(df, metric)\n",
    "    \n",
    "    # compute rankings\n",
    "    rankings = []\n",
    "    for score in scores:\n",
    "        r = np.argsort(np.argsort(score))\n",
    "        rankings.append(r)\n",
    "    return np.array(rankings)\n",
    "\n",
    "def borda_aggregation_metric(df, metric):\n",
    "    rankings = np.array(rankings_by_utterances(df, metric))\n",
    "    # each row of rankings is an utterance, each column is a system\n",
    "    # we sum the ranks obtained by each system, that is along axis 0\n",
    "    borda_count = rankings.sum(axis=0)  # array of size nb_systems with cumulated ranks\n",
    "    borda_aggr = np.argsort(np.argsort(borda_count))\n",
    "    return borda_aggr\n",
    "\n",
    "def two_levels_aggregation(df):\n",
    "    # return the rankings of the systems \n",
    "    # [7, 2, ...] reads S0 is ranked 7th, S1 is ranked 2nd, etc\n",
    "    # rmk: the best system is the argmin\n",
    "    \n",
    "    rankings_by_metrics = []\n",
    "    for metric in df.columns:\n",
    "        print(metric)\n",
    "        rankings_by_metrics.append(borda_aggregation_metric(df, metric))\n",
    "    r = np.array(rankings_by_metrics)\n",
    "    borda_count = r.sum(axis=0)\n",
    "    borda_aggr = np.argsort(np.argsort(borda_count))\n",
    "    return borda_aggr\n",
    "\n",
    "def direct_aggregation(df):\n",
    "    utterances = np.unique(np.array(df.index.get_level_values(1)))\n",
    "    rankings = []\n",
    "    for ut in utterances:\n",
    "        for metric in df.columns:\n",
    "            scores_ut_metric = df.loc[pd.IndexSlice[:, ut], metric].values\n",
    "            ranks = np.argsort(np.argsort(scores_ut_metric))\n",
    "            rankings.append(ranks)\n",
    "    rankings = np.array(rankings)\n",
    "    borda_count = rankings.sum(axis=0)\n",
    "    borda_aggr = np.argsort(np.argsort(borda_count))\n",
    "    return borda_aggr\n",
    "\n",
    "def interection_length(list1, list2):\n",
    "    return len(list(set(list1).intersection(list2)))\n",
    "\n",
    "\n",
    "def mean_aggregation(df):\n",
    "    # return the rankings of the systems when considering the mean\n",
    "    means = df.groupby('System').mean().mean(axis=1).values\n",
    "    return np.argsort(np.argsort(means)),means\n",
    "\n",
    "def reverse_ranking(sigma):  \n",
    "    # from permutation to ranking\n",
    "    if not isinstance(sigma, list):\n",
    "        sigma = sigma.tolist()\n",
    "    final_ranking = []\n",
    "    for i in range(len(sigma)):\n",
    "        final_ranking.append(sigma.index(i))\n",
    "    return final_ranking"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Analysis of the methods"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Real world perturbation experiments : on metrics"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def generate_metric_experiments(number_of_samples,path,ds,seed=42):\n",
    "    considered_df = pd.read_csv(os.path.join(path,ds)).set_index(['System','Utterance'])\n",
    "    considered_columns = list(considered_df.columns)\n",
    "    random.seed(seed)\n",
    "    final_dic = {\n",
    "        'mean':{},\n",
    "        'direct':{},\n",
    "        '2steps':{}\n",
    "    }\n",
    "    for i in range(1,len(considered_columns)+1):\n",
    "        final_dic['mean']['{}'.format(i)] = []\n",
    "        final_dic['direct']['{}'.format(i)] = []\n",
    "        final_dic['2steps']['{}'.format(i)] = []\n",
    "\n",
    "    for _ in tqdm(range(number_of_samples),'Nbs of Experiments'):\n",
    "        random.shuffle(considered_columns)\n",
    "        for i in tqdm(range(1,len(considered_columns)+1),'Running For DF'):\n",
    "            running_df =  considered_df[considered_columns[:i]]\n",
    "            c_mean_aggreg = mean_aggregation(running_df)[0]\n",
    "            c_direct_aggreg = direct_aggregation(running_df)\n",
    "            c_two_level_aggreg = two_levels_aggregation(running_df)\n",
    "            final_dic['mean']['{}'.format(i)].append(c_mean_aggreg)\n",
    "            final_dic['direct']['{}'.format(i)].append(c_direct_aggreg)\n",
    "            final_dic['2steps']['{}'.format(i)].append(c_two_level_aggreg)\n",
    "            \n",
    "    return final_dic\n",
    "\n",
    "def compute_distances(final_dic,path,ds):\n",
    "    considered_df = pd.read_csv(os.path.join(path,ds)).set_index(['System','Utterance'])\n",
    "    scores_to_plot = {\n",
    "    'criterion': [],\n",
    "    'method': [],\n",
    "    'correlation': [],\n",
    "        'kendallTau': [],\n",
    "    }\n",
    "    ref_ranks = {\n",
    "         'mean':mean_aggregation(considered_df)[0],\n",
    "        'direct': direct_aggregation(considered_df),\n",
    "        '2steps': two_levels_aggregation(considered_df)\n",
    "    }\n",
    "    for k,v in final_dic.items(): \n",
    "        ref_rank = ref_ranks[k]\n",
    "        for sub_k,sub_v in tqdm(v.items(),'Itterations'):\n",
    "            for score in sub_v:\n",
    "                scores_to_plot['criterion'].append(int(sub_k)/considered_df.shape[1])\n",
    "                scores_to_plot['method'].append(k)\n",
    "                scores_to_plot['correlation'].append(stats.kendalltau(score, ref_rank)[0])\n",
    "                scores_to_plot['kendallTau'].append(kendallTau(score, ref_rank))\n",
    "    return scores_to_plot\n",
    "\n",
    "import itertools\n",
    "\n",
    "def kendallTau(A, B):\n",
    "    pairs = itertools.combinations(range(0, len(A)), 2)\n",
    "\n",
    "    distance = 0\n",
    "\n",
    "    for x, y in pairs:\n",
    "        a = A[x] - A[y]\n",
    "        b = B[x] - B[y]\n",
    "\n",
    "        # if discordant (different signs)\n",
    "        if (a * b < 0):\n",
    "            distance += 1\n",
    "\n",
    "    return distance\n",
    "\n",
    "def plot_and_save(df,considered_ds,title):\n",
    "    sns.set_palette(\"Set2\")\n",
    "    fig,ax = plt.subplots(figsize=(6,6))\n",
    "    sns.lineplot(x=\"criterion\", y=\"correlation\", hue=\"method\",style=\"method\",data=df_to_plot,linewidth = 5,ci=100)\n",
    "    plt.yticks(fontsize=20) #hue=\"losses\",\n",
    "    plt.xticks(fontsize=20) #hue=\"losses\",\n",
    "    #plt.title('{}'.format(title),fontsize=30)\n",
    "    plt.ylabel('$\\\\tau$',fontsize=30)\n",
    "    plt.xlabel('% criterion',fontsize=25)\n",
    "    plt.tight_layout()\n",
    "    L=plt.legend(fontsize=23)\n",
    "    for line in L.get_lines():\n",
    "        line.set_linewidth(5.0)\n",
    "    L.get_texts()[0].set_text('Aggreg.')\n",
    "    L.get_texts()[1].set_text('$\\\\sigma^{mean}$')\n",
    "    L.get_texts()[2].set_text('$\\\\sigma^{l}$')\n",
    "    L.get_texts()[3].set_text('$\\\\sigma^{2l}$')\n",
    "\n",
    "    #N = 6\n",
    "    #ax.set_yticks(np.round(np.linspace(0.76, 1, N), 2))\n",
    "\n",
    "    N = 5\n",
    "    ax.set_xticks(np.round(np.linspace(0, 1, N), 2))\n",
    "    plt.savefig('two_level_ranking_{}.pdf'.format(considered_ds),format='pdf')\n",
    "    plt.show()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print('Available ds',AVAILBLE_DATASETS)\n",
    "ds2title = {\n",
    "    \n",
    "    'DIALOG_pc.csv':'PersonaChat',\n",
    " 'DIALOG_tc.csv':'TopicChat',\n",
    " 'FLICKR.csv':'FLICKR',\n",
    " 'MLQE.csv':'MLQE',\n",
    " 'REAL_SUM.csv':'RealSum',\n",
    " 'SUM_EVAL.csv':'SumEval',\n",
    " 'TAC_08.csv':'TAC08',\n",
    " 'TAC_09.csv':'TAC09',\n",
    " 'TAC_11.csv':'TAC11'\n",
    "    \n",
    "}\n",
    "GENERATE_DATA_ROBUTNESS_SYSTEM  = False"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Generating Examples"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "path = 'final_df'\n",
    "number_of_samples=50\n",
    "if GENERATE_DATA_ROBUTNESS_SYSTEM:\n",
    "    for ds in AVAILBLE_DATASETS:\n",
    "        title = ds2title[ds]\n",
    "        synthetic_metric_scores =  generate_metric_experiments(number_of_samples,path,ds)\n",
    "        file = 'synthetic_metric_scores_{}_{}.json'.format(ds,number_of_samples)\n",
    "        with open('synthetic_metrics_scores/{}'.format(file),'w', encoding='utf-8') as file:\n",
    "            for k,v in synthetic_metric_scores.items():\n",
    "                for sub_k,sub_v in v.items():\n",
    "                    try:\n",
    "                        synthetic_metric_scores[k][sub_k] = [i.tolist() for i in sub_v]\n",
    "                    except: \n",
    "                        print('Error')\n",
    "            json.dump(synthetic_metric_scores,file)\n",
    "        scores_to_plot = compute_distances(synthetic_metric_scores,path,ds)\n",
    "        df_to_plot = pd.DataFrame(scores_to_plot) \n",
    "        plot_and_save(df_to_plot,ds,title)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Loading data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "for considered_ds in AVAILBLE_DATASETS:\n",
    "    number_of_samples=50\n",
    "    path = 'final_df'\n",
    "    file = 'synthetic_metric_scores_{}_{}.json'.format(considered_ds,number_of_samples)\n",
    "\n",
    "    with open('synthetic_metrics_scores/{}'.format(file),'r', encoding='utf-8') as file:\n",
    "        file_results = json.load(file)\n",
    "    scores_to_plot = compute_distances(file_results,path,considered_ds)\n",
    "    df_to_plot = pd.DataFrame(scores_to_plot)   \n",
    "    plot_and_save(df_to_plot,considered_ds,'')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Real world perturbation experiments : on systems"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "GENERATE_DATA_ROBUTNESS_SYSTEM= False"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "def generate_system_experiment(number_of_samples,path= 'final_df',ds=AVAILBLE_DATASETS[6],seed=42):\n",
    "    random.seed(seed)\n",
    "    l_final_dic = []\n",
    "    data_df = pd.read_csv(os.path.join(path,ds)).set_index(['System','Utterance'])\n",
    "    number_of_systems = len(copy.deepcopy(list(set(data_df.reset_index().System.values)))) +1\n",
    "    for number_of_systems in tqdm(range(1,number_of_systems + 1),'Number of Systems'): # start at 2\n",
    "        for _ in tqdm(range(number_of_samples),'Experiments'):\n",
    "            data_df = pd.read_csv(os.path.join(path,ds)).set_index(['System','Utterance'])\n",
    "            systems = copy.deepcopy(list(set(data_df.reset_index().System.values)))\n",
    "            random.shuffle(systems)\n",
    "            df = data_df[data_df.index.isin(systems, level=0)]\n",
    "            df = df.reindex(systems, axis=0, level=0)\n",
    "            ref_ranks = {\n",
    "             'mean':mean_aggregation(df)[0][:number_of_systems],\n",
    "            'direct': direct_aggregation(df)[:number_of_systems],\n",
    "            '2steps': two_levels_aggregation(df)[:number_of_systems]\n",
    "                    }\n",
    "            selected_systems = systems[:number_of_systems]\n",
    "            running_df = df[df.index.isin(selected_systems, level=0)]\n",
    "            running_df_new = running_df.reindex(selected_systems, axis=0, level=0)\n",
    "\n",
    "\n",
    "            runing_ranks = {\n",
    "             'mean':mean_aggregation(running_df)[0],\n",
    "            'direct': direct_aggregation(running_df),\n",
    "            '2steps': two_levels_aggregation(running_df)\n",
    "                    }\n",
    "\n",
    "            final_dic = {}\n",
    "\n",
    "            for r_key in ref_ranks.keys():\n",
    "                    final_dic['{}'.format(r_key)] = stats.kendalltau(ref_ranks[r_key], runing_ranks[r_key])[0]\n",
    "            l_final_dic.append(final_dic)  \n",
    "    return l_final_dic"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Generating Examples"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "number_of_samples= 50\n",
    "if GENERATE_DATA_ROBUTNESS_SYSTEM:\n",
    "    for ds in tqdm(AVAILBLE_DATASETS,'datasets'):\n",
    "        l_final_dic=  generate_system_experiment(number_of_samples,path= 'final_df',ds=ds,seed=42)\n",
    "        df_dic = {\n",
    "            'type':[],\n",
    "            'correlation':[],\n",
    "            'nbs':[]\n",
    "        }\n",
    "        for index,c_dic in enumerate(l_final_dic):\n",
    "            index_runing=index // number_of_samples \n",
    "            for k,v in c_dic.items():\n",
    "                df_dic['nbs'].append(index_runing)\n",
    "                df_dic['type'].append(k)\n",
    "                df_dic['correlation'].append(v)\n",
    "        with open('final_dic_ajout_de_systems_{}.json'.format(ds),'w') as file:\n",
    "            json.dump(l_final_dic,file)\n",
    "        plt.figure(figsize=(12,8))\n",
    "        df_to_plot = pd.DataFrame(df_dic)\n",
    "        sns.lineplot(x=\"nbs\", y=\"correlation\", hue=\"type\",data=df_to_plot)\n",
    "        plt.show()\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Loading Examples"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "ds = 'TAC_11.csv'\n",
    "with open('final_dic_testsize_{}.json'.format(ds),'r') as file:\n",
    "        df_dic = json.load(file)\n",
    "plt.figure(figsize=(12,8))\n",
    "df_to_plot = pd.DataFrame(df_dic)\n",
    "sns.lineplot(x=\"nbs\", y=\"correlation\", hue=\"type\",data=df_to_plot)\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Real world perturbation experiments : on systems test size"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def generate_testsize_experiment(number_of_samples,path= 'final_df',ds=AVAILBLE_DATASETS[6],seed=42):\n",
    "    random.seed(seed)\n",
    "    l_final_dic = []\n",
    "    data_df = pd.read_csv(os.path.join(path,ds)).set_index(['System','Utterance'])\n",
    "    nb_of_systems = len(list(set(data_df.reset_index().System.values)))\n",
    "    assert data_df.shape[0] % nb_of_systems == 0\n",
    "    number_of_instance = data_df.shape[0] // nb_of_systems \n",
    "    final_ranks = {\n",
    "             'mean':mean_aggregation(data_df)[0],\n",
    "            'direct': direct_aggregation(data_df),\n",
    "            '2steps': two_levels_aggregation(data_df)\n",
    "                    }\n",
    "\n",
    "\n",
    "    utterances = list(set(data_df.reset_index().Utterance.values))\n",
    "    for i in tqdm(range(1,number_of_instance+1),'Number of Instances'): # start at 1\n",
    "        for _ in tqdm(range(number_of_samples),'Experiments'):\n",
    "            random.shuffle(utterances)\n",
    "            newindex = sorted(data_df.index, key=lambda x: utterances.index(x[1]))\n",
    "            newindex = sorted(newindex, key=lambda tup: tup[0]) \n",
    "            df = data_df.reindex(newindex,axis = 0)\n",
    "            df = df.loc[df.index.get_level_values(1).isin(utterances[:i])]\n",
    "            runing_ranks = {\n",
    "             'mean':mean_aggregation(df)[0],\n",
    "            'direct': direct_aggregation(df),\n",
    "            '2steps': two_levels_aggregation(df)\n",
    "                    }\n",
    "            final_dic = {}\n",
    "            for r_key in runing_ranks.keys():\n",
    "                    final_dic['{}'.format(r_key)] = stats.kendalltau(final_ranks[r_key], runing_ranks[r_key])[0]\n",
    "            l_final_dic.append(final_dic)  \n",
    "    return l_final_dic\n",
    "def plot_and_save_test(df,considered_ds,title):\n",
    "    sns.set_palette(\"Set2\")\n",
    "    fig,ax = plt.subplots(figsize=(6,6))\n",
    "    sns.lineplot(x=\"nbs\", y=\"correlation\", hue=\"type\",style=\"type\",data=df_to_plot,linewidth = 5,ci=100)\n",
    "    plt.yticks(fontsize=20) #hue=\"losses\",\n",
    "    plt.xticks(fontsize=20) #hue=\"losses\",\n",
    "    #plt.title('{}'.format(title),fontsize=30)\n",
    "    plt.ylabel('$\\\\tau$',fontsize=30)\n",
    "    plt.xlabel('% size',fontsize=25)\n",
    "    plt.tight_layout()\n",
    "    L=plt.legend(fontsize=23)\n",
    "    for line in L.get_lines():\n",
    "        line.set_linewidth(5.0)\n",
    "    L.get_texts()[0].set_text('Aggreg.')\n",
    "    L.get_texts()[1].set_text('$\\\\sigma^{mean}$')\n",
    "    L.get_texts()[2].set_text('$\\\\sigma^{l}$')\n",
    "    L.get_texts()[3].set_text('$\\\\sigma^{2l}$')\n",
    "\n",
    "    #N = 6\n",
    "    #ax.set_yticks(np.round(np.linspace(0.76, 1, N), 2))\n",
    "\n",
    "    N = 5\n",
    "    ax.set_xticks(np.round(np.linspace(0, 1, N), 2))\n",
    "    plt.savefig('test_two_level_ranking_{}.pdf'.format(considered_ds),format='pdf')\n",
    "    plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Generating Examples"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "GENERATE_DATA_ROBUTNESS_TEST = False\n",
    "if GENERATE_DATA_ROBUTNESS_TEST:\n",
    "    number_of_samples = 50\n",
    "    long = [i for i in AVAILBLE_DATASETS if i not in ['DIALOG_pc.csv','FLICKR.csv','SUM_EVAL.csv','TAC_08.csv']] # 5 are enough\n",
    "    for ds in tqdm(long):\n",
    "        print(ds)\n",
    "        l_final_dic=generate_testsize_experiment(number_of_samples,path= 'final_df',ds=ds,seed=42)\n",
    "        df_dic = {\n",
    "            'type':[],\n",
    "            'correlation':[],\n",
    "            'nbs':[]\n",
    "        }\n",
    "        for index,c_dic in enumerate(l_final_dic):\n",
    "            index_runing=index // number_of_samples \n",
    "            for k,v in c_dic.items():\n",
    "                df_dic['nbs'].append(index_runing / (len(l_final_dic) //number_of_samples ))\n",
    "                df_dic['type'].append(k)\n",
    "                df_dic['correlation'].append(v)\n",
    "        with open('final_dic_testsize_{}_{}.json'.format(ds,number_of_samples),'w') as file:\n",
    "            json.dump(df_dic,file)\n",
    "        print('Considered ds',ds)\n",
    "        df_to_plot = pd.DataFrame(df_dic)\n",
    "        plot_and_save_test(df_to_plot,considered_ds,'')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Loading"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "number_of_samples = 50\n",
    "for ds in AVAILBLE_DATASETS[-1:]:\n",
    "    if True:#ds not in long:\n",
    "        print(ds)\n",
    "        with open('final_dic_testsize_{}_{}.json'.format(ds,number_of_samples),'r') as file:\n",
    "            l_final_dic= json.load(file)\n",
    "        df_to_plot = pd.DataFrame(l_final_dic)\n",
    "        plot_and_save_test(df_to_plot,ds,'')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Figure 4 + Appendix"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def analyse_agreements(data_path, type_of_agreement=1,aggrement = False):\n",
    "    final_to_plot = {\n",
    "       'ds':[],\n",
    "        'is_1':[], #is equal\n",
    "        'is_3':[],\n",
    "        'is_5':[],\n",
    "        'is_10':[],\n",
    "        'tau':[],\n",
    "        'method':[],\n",
    "        'number':[],\n",
    "        \"agreement\":[],\n",
    "        'moyen_kendall':[],\n",
    "        }\n",
    "    max_number_of_systems= 20\n",
    "    #for i in range(max_number_of_systems):\n",
    "    #    final_to_plot['{}'.format(i)] = []\n",
    "    method = {\n",
    "        \"1\":\"mean22l\",\n",
    "        \"2\":'direct22l',\n",
    "        \"0\":'mean2direct'\n",
    "    }\n",
    "    for dataset in tqdm(AVAILBLE_DATASETS,'Datasets'):\n",
    "        assert type_of_agreement in [1,-1]\n",
    "        considered_df = pd.read_csv(os.path.join(data_path,dataset)).set_index(['System','Utterance'])\n",
    "        # create rankings\n",
    "\n",
    "        mean_aggreg = reverse_ranking(mean_aggregation(considered_df)[0])\n",
    "        direct_aggreg = reverse_ranking(direct_aggregation(considered_df))\n",
    "        two_level_aggreg = reverse_ranking(two_levels_aggregation(considered_df))\n",
    "        comparizons = [(mean_aggreg,direct_aggreg),(direct_aggreg,two_level_aggreg),(mean_aggreg,two_level_aggreg)]\n",
    "        for index, (a,b) in enumerate(comparizons):\n",
    "            \n",
    "            \n",
    "            if type_of_agreement == 1:\n",
    "                for i in range(1,max_number_of_systems):\n",
    "\n",
    "                    \n",
    "                    element_a = [j for j in a[-i:] if j in b[-i:]]\n",
    "                    element_b = [j for j in b[-i:] if j in a[-i:]]\n",
    "                    assert len(element_a) == len(element_b)\n",
    "                    \n",
    "                    if len(element_a) > 1 or aggrement:\n",
    "                        final_to_plot['moyen_kendall'].append(stats.kendalltau(element_a, element_b)[0])\n",
    "                    \n",
    "                        final_to_plot['method'].append(method[str(index)])\n",
    "                        final_to_plot['ds'].append(dataset)\n",
    "                        final_to_plot['number'].append(i)\n",
    "                        final_to_plot['agreement'].append(interection_length(a[-i:], b[-i:])/i)\n",
    "\n",
    "                        #final_to_plot['3'].append(interection_length(a[-3:], b[-3:]))\n",
    "                        #final_to_plot['5'].append(interection_length(a[-5:], b[-5:]))\n",
    "                        #final_to_plot['10'].append(interection_length(a[-10:], b[-10:]))\n",
    "                        final_to_plot['is_1'].append(a[-1 ] == b[-1:])\n",
    "                        final_to_plot['is_3'].append(a[-3:] == b[-3:])\n",
    "                        final_to_plot['is_5'].append(a[-5:] == b[-5:])\n",
    "                        final_to_plot['is_10'].append(a[-10:] == b[-10:])\n",
    "                        final_to_plot['tau'].append(stats.kendalltau(a, b)[0] )\n",
    "\n",
    "            else :\n",
    "                for i in range(max_number_of_systems):\n",
    "                    final_to_plot['{}'.format(i)].append(interection_length(a[:i], b[:i]))\n",
    "                #final_to_plot['1'].append(interection_length(a[:1], b[:1]))\n",
    "                #final_to_plot['3'].append(interection_length(a[:3], b[:3]))\n",
    "                #final_to_plot['5'].append(interection_length(a[:5], b[:5]))\n",
    "                #final_to_plot['10'].append(interection_length(a[:10], b[:10]))  \n",
    "                final_to_plot['is_1'].append(a[0] == b[0])\n",
    "                final_to_plot['is_3'].append(a[:3] == b[:3])\n",
    "                final_to_plot['is_5'].append(a[:5] == b[:5])\n",
    "                final_to_plot['is_10'].append(a[:10] == b[:10])\n",
    "                final_to_plot['tau'].append(stats.kendalltau(a, b)[0] )\n",
    "    return final_to_plot"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "data_df"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Global Kendall"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "data_df[['tau','method',\"ds\"]].groupby(['ds',\"method\"]).mean()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Agreement Top/Last Systems"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "final_to_plot=  analyse_agreements(data_path = 'final_df', type_of_agreement=1,aggrement = False) \n",
    "data_df =pd.DataFrame(final_to_plot)\n",
    "plt.figure(figsize=(20,10))\n",
    "sns.set_palette(\"Set2\")\n",
    "data_filter = data_df[data_df.number.isin([int(i) for i in range(0,21,2)])]\n",
    "sns.barplot(data=data_filter, x=\"number\", y=\"moyen_kendall\",hue='method')\n",
    "plt.ylabel('$\\\\tau$',size = 45)\n",
    "leg = plt.legend(fontsize=40)\n",
    "for line in leg.get_lines():\n",
    "    line.set_linewidth(5.0)\n",
    "leg.get_texts()[0].set_text('$\\\\tau(\\\\sigma^{mean},\\\\sigma^{l})$')\n",
    "leg.get_texts()[1].set_text('$\\\\tau(\\\\sigma^{mean},\\\\sigma^{2l})$')\n",
    "leg.get_texts()[2].set_text('$\\\\tau(\\\\sigma^{l},\\\\sigma^{2l})$')\n",
    "plt.yticks(fontsize=40) #hue=\"losses\",\n",
    "plt.xticks(fontsize=40) #hue=\"losses\",\n",
    "plt.xlabel('Top Systems',size = 45)\n",
    "plt.tight_layout()\n",
    "plt.savefig('bar_all_ranking_analysis.pdf',format='pdf')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "final_to_plot=  analyse_agreements(data_path = 'final_df', type_of_agreement=1,aggrement = False) \n",
    "data_df =pd.DataFrame(final_to_plot)\n",
    "plt.figure(figsize=(20,10))\n",
    "plt.tight_layout()\n",
    "sns.set_palette(\"Set2\")\n",
    "data_filter = data_df[data_df.number.isin([int(i) for i in range(0,21,2)])]\n",
    "data_filter = data_filter[data_filter.ds.isin(['DIALOG_pc.csv','DIALOG_tc.csv'])]\n",
    "sns.barplot(data=data_filter, x=\"number\", y=\"moyen_kendall\",hue='method')\n",
    "plt.ylabel('$\\\\tau$',size = 45)\n",
    "leg = plt.legend(fontsize=40)\n",
    "for line in leg.get_lines():\n",
    "    line.set_linewidth(5.0)\n",
    "leg.get_texts()[0].set_text('$\\\\tau(\\\\sigma^{mean},\\\\sigma^{l})$')\n",
    "leg.get_texts()[1].set_text('$\\\\tau(\\\\sigma^{mean},\\\\sigma^{2l})$')\n",
    "leg.get_texts()[2].set_text('$\\\\tau(\\\\sigma^{l},\\\\sigma^{2l})$')\n",
    "plt.yticks(fontsize=40) #hue=\"losses\",\n",
    "plt.xticks(fontsize=40) #hue=\"losses\",\n",
    "plt.xlabel('Top Systems',size = 45)\n",
    "plt.tight_layout()\n",
    "plt.savefig('bar_dialog_ranking_analysis.pdf',format='pdf')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "plt.figure(figsize=(20,10))\n",
    "sns.set_palette(\"Set2\")\n",
    "data_filter = data_df[data_df.number.isin([int(i) for i in range(0,21,2)])]\n",
    "data_filter = data_filter[data_filter.ds.isin(['FLICKR.csv'])]\n",
    "plt.tight_layout()\n",
    "sns.barplot(data=data_filter, x=\"number\", y=\"moyen_kendall\",hue='method')\n",
    "plt.ylabel('$\\\\tau$',size = 45)\n",
    "leg = plt.legend(fontsize=40)\n",
    "for line in leg.get_lines():\n",
    "    line.set_linewidth(5.0)\n",
    "leg.get_texts()[0].set_text('$\\\\tau(\\\\sigma^{mean},\\\\sigma^{l})$')\n",
    "leg.get_texts()[1].set_text('$\\\\tau(\\\\sigma^{mean},\\\\sigma^{2l})$')\n",
    "leg.get_texts()[2].set_text('$\\\\tau(\\\\sigma^{l},\\\\sigma^{2l})$')\n",
    "plt.yticks(fontsize=40) #hue=\"losses\",\n",
    "plt.xticks(fontsize=40) #hue=\"losses\",\n",
    "plt.xlabel('Top Systems',size = 45)\n",
    "plt.tight_layout()\n",
    "plt.savefig('bar_flick_ranking_analysis.pdf',format='pdf')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "plt.figure(figsize=(20,10))\n",
    "sns.set_palette(\"Set2\")\n",
    "data_filter = data_df[data_df.number.isin([int(i) for i in range(0,21,2)])]\n",
    "data_filter = data_filter[data_filter.ds.isin(['MLQE.csv'])]\n",
    "sns.barplot(data=data_filter, x=\"number\", y=\"moyen_kendall\",hue='method')\n",
    "plt.ylabel('$\\\\tau$',size = 45)\n",
    "leg = plt.legend(fontsize=40)\n",
    "for line in leg.get_lines():\n",
    "    line.set_linewidth(5.0)\n",
    "leg.get_texts()[0].set_text('$\\\\tau(\\\\sigma^{mean},\\\\sigma^{l})$')\n",
    "leg.get_texts()[1].set_text('$\\\\tau(\\\\sigma^{mean},\\\\sigma^{2l})$')\n",
    "leg.get_texts()[2].set_text('$\\\\tau(\\\\sigma^{l},\\\\sigma^{2l})$')\n",
    "plt.yticks(fontsize=40) #hue=\"losses\",\n",
    "plt.xticks(fontsize=40) #hue=\"losses\",\n",
    "plt.xlabel('Top Systems',size = 45)\n",
    "plt.tight_layout()\n",
    "plt.savefig('bar_mlqe_ranking_analysis.pdf',format='pdf')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "plt.figure(figsize=(20,10))\n",
    "sns.set_palette(\"Set2\")\n",
    "data_filter = data_df[data_df.number.isin([int(i) for i in range(0,21,2)])]\n",
    "data_filter = data_filter[data_filter.ds.isin(['TAC_08.csv','TAC_09.csv','TAC_11.csv'])]\n",
    "sns.barplot(data=data_filter, x=\"number\", y=\"moyen_kendall\",hue='method')\n",
    "plt.ylabel('$\\\\tau$',size = 45)\n",
    "leg = plt.legend(fontsize=40)\n",
    "for line in leg.get_lines():\n",
    "    line.set_linewidth(5.0)\n",
    "leg.get_texts()[0].set_text('$\\\\tau(\\\\sigma^{mean},\\\\sigma^{l})$')\n",
    "leg.get_texts()[1].set_text('$\\\\tau(\\\\sigma^{mean},\\\\sigma^{2l})$')\n",
    "leg.get_texts()[2].set_text('$\\\\tau(\\\\sigma^{l},\\\\sigma^{2l})$')\n",
    "plt.yticks(fontsize=40) #hue=\"losses\",\n",
    "plt.xticks(fontsize=40) #hue=\"losses\",\n",
    "plt.xlabel('Top Systems',size = 45)\n",
    "plt.tight_layout()\n",
    "plt.savefig('bar_tac_ranking_analysis.pdf',format='pdf')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "plt.figure(figsize=(20,10))\n",
    "sns.set_palette(\"Set2\")\n",
    "data_filter = data_df[data_df.number.isin([int(i) for i in range(0,21,2)])]\n",
    "data_filter = data_filter[data_filter.ds.isin(['REAL_SUM.csv','SUM_EVAL.csv'])]\n",
    "sns.barplot(data=data_filter, x=\"number\", y=\"moyen_kendall\",hue='method')\n",
    "plt.ylabel('$\\\\tau$',size = 45)\n",
    "leg = plt.legend(fontsize=40)\n",
    "for line in leg.get_lines():\n",
    "    line.set_linewidth(5.0)\n",
    "leg.get_texts()[0].set_text('$\\\\tau(\\\\sigma^{mean},\\\\sigma^{l})$')\n",
    "leg.get_texts()[1].set_text('$\\\\tau(\\\\sigma^{mean},\\\\sigma^{2l})$')\n",
    "leg.get_texts()[2].set_text('$\\\\tau(\\\\sigma^{l},\\\\sigma^{2l})$')\n",
    "plt.yticks(fontsize=40) #hue=\"losses\",\n",
    "plt.xticks(fontsize=40) #hue=\"losses\",\n",
    "plt.xlabel('Top Systems',size = 45)\n",
    "plt.tight_layout()\n",
    "plt.savefig('bar_sum_ranking_analysis.pdf',format='pdf')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "##### Figure 4a)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#final_to_plot=  analyse_agreements(data_path = 'final_df', type_of_agreement=1,aggrement = True) \n",
    "data_df =pd.DataFrame(final_to_plot)\n",
    "from matplotlib.ticker import MaxNLocator\n",
    "plt.figure(figsize=(10,10))\n",
    "\n",
    "ax = sns.lineplot(data=data_df, x=\"number\", y=\"agreement\",hue='method',markers=True,ci=50,lw=3.5)\n",
    "ax.xaxis.set_major_locator(MaxNLocator(integer=True))\n",
    "leg = plt.legend(fontsize=30)\n",
    "for line in leg.get_lines():\n",
    "    line.set_linewidth(5.0)\n",
    "leg.get_texts()[0].set_text('Method')\n",
    "leg.get_texts()[1].set_text('$\\\\sigma^{mean},\\\\sigma^{l}$')\n",
    "leg.get_texts()[2].set_text('$\\\\sigma^{mean},\\\\sigma^{2l}$')\n",
    "leg.get_texts()[3].set_text('$\\\\sigma^{l},\\\\sigma^{2l}$')\n",
    "plt.ylabel('Agreement',size = 45)\n",
    "plt.xlabel('Top Systems',size = 45)\n",
    "\n",
    "plt.yticks(fontsize=35) #hue=\"losses\",\n",
    "plt.xticks(fontsize=35) #hue=\"losses\",\n",
    "\n",
    "plt.tight_layout()\n",
    "plt.savefig('all_ranking_analysis_agrement.pdf',format='pdf')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "##### Appendix Figures"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "final_to_plot=  analyse_agreements(data_path = 'final_df', type_of_agreement=1,aggrement = True) \n",
    "data_df =pd.DataFrame(final_to_plot)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "data_df_filtered = data_df[data_df.ds.isin(['DIALOG_pc.csv','DIALOG_tc.csv'])]\n",
    "plt.figure(figsize=(10,10))\n",
    "ax = sns.lineplot(data=data_df_filtered, x=\"number\", y=\"agreement\",hue='method',style='ds', markers=True,ci=50,lw=3.5,markersize=10)\n",
    "ax.xaxis.set_major_locator(MaxNLocator(integer=True))\n",
    "leg = plt.legend(fontsize=20)\n",
    "for line in leg.get_lines():\n",
    "    line.set_linewidth(5.0)\n",
    "leg.get_texts()[0].set_text('Method')\n",
    "leg.get_texts()[1].set_text('$\\\\sigma^{mean},\\\\sigma^{l}$')\n",
    "leg.get_texts()[2].set_text('$\\\\sigma^{mean},\\\\sigma^{2l}$')\n",
    "leg.get_texts()[3].set_text('$\\\\sigma^{l},\\\\sigma^{2l}$')\n",
    "plt.ylabel('Agreement',size = 45)\n",
    "plt.xlabel('Top Systems',size = 45)\n",
    "\n",
    "plt.yticks(fontsize=35) #hue=\"losses\",\n",
    "plt.xticks(fontsize=35) #hue=\"losses\",\n",
    "ax.xaxis.set_major_locator(MaxNLocator(integer=True))\n",
    "plt.tight_layout()\n",
    "plt.savefig('dialog_all_ranking_analysis_agrement.pdf',format='pdf')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "data_df_filtered = data_df[data_df.ds.isin(['FLICKR.csv'])]\n",
    "plt.figure(figsize=(10,10))\n",
    "ax = sns.lineplot(data=data_df_filtered, x=\"number\", y=\"agreement\",hue='method',style='ds', markers=True,ci=50,lw=3.5,markersize=10)\n",
    "leg = plt.legend(fontsize=30)\n",
    "ax.xaxis.set_major_locator(MaxNLocator(integer=True))\n",
    "for line in leg.get_lines():\n",
    "    line.set_linewidth(5.0)\n",
    "leg.get_texts()[0].set_text('Method')\n",
    "leg.get_texts()[1].set_text('$\\\\sigma^{mean},\\\\sigma^{l}$')\n",
    "leg.get_texts()[2].set_text('$\\\\sigma^{mean},\\\\sigma^{2l}$')\n",
    "leg.get_texts()[3].set_text('$\\\\sigma^{l},\\\\sigma^{2l}$')\n",
    "plt.ylabel('Agreement',size = 45)\n",
    "plt.xlabel('Top Systems',size = 45)\n",
    "\n",
    "plt.yticks(fontsize=35) #hue=\"losses\",\n",
    "plt.xticks(fontsize=35) #hue=\"losses\",\n",
    "plt.tight_layout()\n",
    "plt.savefig('flickr_all_ranking_analysis_agrement.pdf',format='pdf')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "data_df_filtered = data_df[data_df.ds.isin(['MLQE.csv'])]\n",
    "plt.figure(figsize=(10,10))\n",
    "ax = sns.lineplot(data=data_df_filtered, x=\"number\", y=\"agreement\",hue='method',style='ds', markers=True,ci=50,lw=3.5,markersize=10)\n",
    "leg = plt.legend(fontsize=20)\n",
    "ax.xaxis.set_major_locator(MaxNLocator(integer=True))\n",
    "for line in leg.get_lines():\n",
    "    line.set_linewidth(5.0)\n",
    "leg.get_texts()[0].set_text('Method')\n",
    "leg.get_texts()[1].set_text('$\\\\sigma^{mean},\\\\sigma^{l}$')\n",
    "leg.get_texts()[2].set_text('$\\\\sigma^{mean},\\\\sigma^{2l}$')\n",
    "leg.get_texts()[3].set_text('$\\\\sigma^{l},\\\\sigma^{2l}$')\n",
    "plt.ylabel('Agreement',size = 45)\n",
    "plt.xlabel('Top Systems',size = 45)\n",
    "\n",
    "plt.yticks(fontsize=35) #hue=\"losses\",\n",
    "plt.xticks(fontsize=35) #hue=\"losses\",\n",
    "plt.tight_layout()\n",
    "plt.savefig('mlqe_all_ranking_analysis_agrement.pdf',format='pdf')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "data_df_filtered = data_df[data_df.ds.isin(['TAC_08.csv','TAC_09.csv','TAC_11.csv'])]\n",
    "plt.figure(figsize=(10,10))\n",
    "ax = sns.lineplot(data=data_df_filtered, x=\"number\", y=\"agreement\",hue='method',style='ds', markers=True,ci=50,lw=3.5,markersize=10)\n",
    "leg = plt.legend(fontsize=20)\n",
    "ax.xaxis.set_major_locator(MaxNLocator(integer=True))\n",
    "for line in leg.get_lines():\n",
    "    line.set_linewidth(5.0)\n",
    "leg.get_texts()[0].set_text('Method')\n",
    "leg.get_texts()[1].set_text('$\\\\sigma^{mean},\\\\sigma^{l}$')\n",
    "leg.get_texts()[2].set_text('$\\\\sigma^{mean},\\\\sigma^{2l}$')\n",
    "leg.get_texts()[3].set_text('$\\\\sigma^{l},\\\\sigma^{2l}$')\n",
    "plt.ylabel('Agreement',size = 45)\n",
    "plt.xlabel('Top Systems',size = 45)\n",
    "\n",
    "plt.yticks(fontsize=35) #hue=\"losses\",\n",
    "plt.xticks(fontsize=35) #hue=\"losses\",\n",
    "plt.tight_layout()\n",
    "plt.savefig('tac_all_ranking_analysis_agrement.pdf',format='pdf')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "data_df_filtered = data_df[data_df.ds.isin(['REAL_SUM.csv','SUM_EVAL.csv'])]\n",
    "plt.figure(figsize=(10,10))\n",
    "ax = sns.lineplot(data=data_df_filtered, x=\"number\", y=\"agreement\",hue='method',style='ds', markers=True,ci=50,lw=3.5,markersize=10)\n",
    "ax.xaxis.set_major_locator(MaxNLocator(integer=True))\n",
    "leg = plt.legend(fontsize=20)\n",
    "for line in leg.get_lines():\n",
    "    line.set_linewidth(5.0)\n",
    "leg.get_texts()[0].set_text('Method')\n",
    "leg.get_texts()[1].set_text('$\\\\sigma^{mean},\\\\sigma^{l}$')\n",
    "leg.get_texts()[2].set_text('$\\\\sigma^{mean},\\\\sigma^{2l}$')\n",
    "leg.get_texts()[3].set_text('$\\\\sigma^{l},\\\\sigma^{2l}$')\n",
    "plt.ylabel('Agreement',size = 45)\n",
    "plt.xlabel('Top Systems',size = 45)\n",
    "\n",
    "plt.yticks(fontsize=35) #hue=\"losses\",\n",
    "plt.xticks(fontsize=35) #hue=\"losses\",\n",
    "plt.tight_layout()\n",
    "plt.savefig('sum_all_ranking_analysis_agrement.pdf',format='pdf')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
