{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "31b9b6cb",
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "\n",
    "import torch\n",
    "import torch.nn as nn\n",
    "\n",
    "from sklearn.model_selection import cross_validate\n",
    "from sklearn.preprocessing import StandardScaler\n",
    "from sklearn.linear_model import LogisticRegression\n",
    "from sklearn.model_selection import train_test_split\n",
    "from transformers import RobertaTokenizer, RobertaModel\n",
    "\n",
    "from scipy.spatial.distance import cdist\n",
    "from skdim.id import MLE\n",
    "\n",
    "from tqdm import tqdm\n",
    "\n",
    "# Computations of Intrinsic dimension for shorter texts are unstable and we want to avoid them;\n",
    "MINIMAL_STABLE_LENGTH = 47"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "9c03e624",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Some weights of the model checkpoint at D:/! Models/roberta-base-cased were not used when initializing RobertaModel: ['lm_head.decoder.weight', 'lm_head.dense.weight', 'lm_head.bias', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias']\n",
      "- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
      "- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n"
     ]
    }
   ],
   "source": [
    "model_path = 'D:/! Models/roberta-base-cased'#'<Path to the model>'\n",
    "tokenizer_path = model_path\n",
    "\n",
    "### Loading the model\n",
    "tokenizer = RobertaTokenizer.from_pretrained(tokenizer_path)\n",
    "model = RobertaModel.from_pretrained(model_path)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "cedd0f0d",
   "metadata": {},
   "outputs": [],
   "source": [
    "### Multilingual case\n",
    "model_path = '.../xlm-base'\n",
    "tokenizer = AutoTokenizer.from_pretrained(model_path)\n",
    "model = AutoModel.from_pretrained(model_path)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "id": "34e093a7",
   "metadata": {},
   "outputs": [],
   "source": [
    "def prim_tree(adj_matrix, power=1.0):\n",
    "    infty = np.max(adj_matrix) + 1.0\n",
    "    \n",
    "    dst = np.ones(adj_matrix.shape[0]) * infty\n",
    "    visited = np.zeros(adj_matrix.shape[0], dtype=bool)\n",
    "    ancestor = -np.ones(adj_matrix.shape[0], dtype=int)\n",
    "\n",
    "    v, s = 0, 0.0\n",
    "    for i in range(adj_matrix.shape[0] - 1):\n",
    "        visited[v] = 1\n",
    "        ancestor[dst > adj_matrix[v]] = v\n",
    "        dst = np.minimum(dst, adj_matrix[v])\n",
    "        dst[visited] = infty\n",
    "        \n",
    "        v = np.argmin(dst)\n",
    "        \n",
    "        s += adj_matrix[v][ancestor[v]] ** power\n",
    "    return s.item()\n",
    "\n",
    "def sample_W(W, nSamples, isRandom=True):\n",
    "    n = W.shape[0]\n",
    "    random_indices = np.random.choice(n, size=nSamples, replace=False)\n",
    "    return W[random_indices]\n",
    "\n",
    "def calculate_ph_dim(W, min_points=40, max_points=510, point_jump=20, alpha=1.0, restarts=3, resamples=7):\n",
    "    # Computations for shorter texts are unstable and we want to avoid them\n",
    "    if W.shape[0] < MINIMAL_STABLE_LENGTH: \n",
    "        return np.nan\n",
    "    \n",
    "    m_candidates = []\n",
    "    for i in range(restarts): \n",
    "        test_n = range(min_points, max_points, point_jump)\n",
    "        lengths = []\n",
    "\n",
    "        for n in test_n:\n",
    "            reruns = np.ones(resamples)\n",
    "            for i in range(resamples):\n",
    "                tmp = sample_W(W, n)\n",
    "                reruns[i] = prim_tree(cdist(tmp, tmp), power=alpha)\n",
    "            lengths.append(np.median(reruns))\n",
    "\n",
    "        lengths = np.array(lengths)\n",
    "        x = np.log(np.array(list(test_n)))\n",
    "        y = np.log(lengths)\n",
    "\n",
    "        N = len(x)\n",
    "        m_candidates.append((N * (x * y).sum() - x.sum() * y.sum()) / (N * (x ** 2).sum() - x.sum() ** 2))\n",
    "    m = np.mean(m_candidates)\n",
    "    return alpha / (1 - m)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "6273310d",
   "metadata": {},
   "outputs": [],
   "source": [
    "'''\n",
    "Get CLS-tokens for all texts in df[key] Pandas DataSeries (RoBERTa-CLS baseline)\n",
    "Parameters:\n",
    "        df  --- Pandas DataFrame\n",
    "        key --- Name of the column\n",
    "        is_list --- Check if the elements of the df[key] are lists (appears in some data)\n",
    "        \n",
    "Returns:\n",
    "    numpy.array of shape (number_of_texts, size_of_embedding=768)\n",
    "'''\n",
    "\n",
    "def get_cls(df, key='text', is_list=False):\n",
    "    dims = np.zeros((len(df[key]),768))\n",
    "    cnt = 0\n",
    "    for text in tqdm(df[key]):\n",
    "        if is_list:\n",
    "            s = text[0]\n",
    "        else:\n",
    "            s = text\n",
    "        inputs = tokenizer(s.replace('\\n', ' '), truncation=True, max_length=512, return_tensors=\"pt\")\n",
    "        with torch.no_grad():\n",
    "            outp = model(**inputs)\n",
    "        dims[cnt] = outp[0][0].numpy()[0]\n",
    "        cnt += 1\n",
    "    return dims"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "id": "3812c114",
   "metadata": {},
   "outputs": [],
   "source": [
    "'''\n",
    "Get MLE for one text\n",
    "Parameters:\n",
    "        text  --- text\n",
    "Returns:\n",
    "    real number or NumPy.nan  --- Intrinsic dimension value of the text in the input data\n",
    "                                                    estimated by Maximum Likelihood Estimation method.'''\n",
    "\n",
    "def get_mle_single(text):\n",
    "    inputs = tokenizer(text.replace('\\n', ' '), truncation=True, max_length=512, return_tensors=\"pt\")\n",
    "    with torch.no_grad():\n",
    "        outp = model(**inputs)\n",
    "    mx_points = inputs['input_ids'].shape[1] - 2\n",
    "        \n",
    "    # Computations for shorter texts are unstable and we want to avoid them\n",
    "    if mx_points < MINIMAL_STABLE_LENGTH:     \n",
    "        return np.nan\n",
    "            \n",
    "    return MLE().fit_transform(outp[0][0].numpy()[1:-1])\n",
    "\n",
    "'''\n",
    "Get MLE for all texts in df[key] Pandas DataSeries (MLE method)\n",
    "Parameters:\n",
    "        df  --- Pandas DataFrame\n",
    "        key --- Name of the column\n",
    "        is_list --- Check if the elements of the df[key] are lists (appears in some data)\n",
    "        \n",
    "Returns:\n",
    "    numpy.array of shape (number_of_texts, 1) --- Intrinsic dimension values for all texts in the input data\n",
    "                                                    estimated by Maximum Likelihood Estimation method.\n",
    "'''\n",
    "\n",
    "def get_mle(df, key='text', is_list=False):\n",
    "    dims = []\n",
    "    for s in tqdm(df[key]):\n",
    "        if is_list:\n",
    "            text = s[0]\n",
    "        else:\n",
    "            text = s\n",
    "            \n",
    "        dims.append(get_mle_single(text))\n",
    "    return np.array(dims).reshape(-1, 1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "c64b8c0e",
   "metadata": {},
   "outputs": [],
   "source": [
    "'''\n",
    "Get PHD for one text\n",
    "Parameters:\n",
    "        text  --- text\n",
    "        alpha --- Parameter alpha for PHD computattion\n",
    "\n",
    "Returns:\n",
    "    real number or NumPy.nan  --- Intrinsic dimension value of the text in the input data\n",
    "                                                    estimated by Persistence Homology Dimension method.'''\n",
    "def get_phd_single(text, alpha=1.0):\n",
    "    inputs = tokenizer(text.replace('\\n', ' '), truncation=True, max_length=512, return_tensors=\"pt\")\n",
    "    with torch.no_grad():\n",
    "        outp = model(**inputs)\n",
    "        \n",
    "    mx_points = inputs['input_ids'].shape[1] - 2        \n",
    "    mn_points = 40\n",
    "    step = ( mx_points - mn_points ) // 7\n",
    "        \n",
    "    return calculate_ph_dim(outp[0][0].numpy()[1:-1],  min_points=mn_points, max_points=mx_points, \\\n",
    "                                     point_jump=step, alpha=alpha)\n",
    "\n",
    "'''\n",
    "Get PHD for all texts in df[key] Pandas DataSeries (PHD method)\n",
    "Parameters:\n",
    "        df  --- Pandas DataFrame\n",
    "        key --- Name of the column\n",
    "        is_list --- Check if the elements of the df[key] are lists (appears in some data)\n",
    "        \n",
    "        alpha --- Parameter alpha for PHD computattion\n",
    "\n",
    "Returns:\n",
    "    numpy.array of shape (number_of_texts, 1) --- Intrinsic dimension values for all texts in the input data\n",
    "                                                    estimated by Persistence Homology Dimension method.\n",
    "'''\n",
    "\n",
    "def get_phd(df, key='text', is_list=False, alpha=1.0):\n",
    "    dims = []\n",
    "    for s in tqdm(df[key]):\n",
    "        if is_list:\n",
    "            text = s[0]\n",
    "        else:\n",
    "            text = s\n",
    "        dims.append(get_phd_single(text, alpha=alpha))\n",
    "\n",
    "    return np.array(dims).reshape(-1, 1)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "07640280",
   "metadata": {},
   "source": [
    "# An example"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "f526ccbf",
   "metadata": {},
   "source": [
    "### RoBERTa-CLS baseline"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "91106694",
   "metadata": {},
   "outputs": [],
   "source": [
    "'''\n",
    "train_idx, valid_idx, test_idx are indexes of train/validation/evaluation splits\n",
    "'''"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7a3026cb",
   "metadata": {},
   "outputs": [],
   "source": [
    "reddit_data = pd.read_json(\".../Datasets/opt_13b_reddit.jsonl_pp\", lines=True)\n",
    "\n",
    "#Training subset\n",
    "human_cls_train_en = get_cls(reddit_data.iloc[train_idx], 'gold_completion')\n",
    "opt_cls_train_en = get_cls(reddit_data.iloc[train_idx], 'gen_completion',is_list=True)\n",
    "\n",
    "#Validation subset\n",
    "human_cls_dev_en = get_cls(reddit_data.iloc[valid_idx], 'gold_completion')\n",
    "opt_cls_dev_en = get_cls(reddit_data.iloc[valid_idx], 'gen_completion',real=False)\n",
    "\n",
    "#Evaluation subset\n",
    "human_cls_test_en = get_cls(reddit_data.iloc[test_idx], 'gold_completion')\n",
    "opt_cls_test_en =  get_cls(reddit_data.iloc[test_idx], 'gen_completion',real=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "77928a6d",
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.linear_model import LogisticRegression\n",
    "\n",
    "X_train = np.vstack([human_cls_train_en, opt_cls_train_en])\n",
    "X_valid = np.vstack([human_cls_dev_en, opt_cls_dev_en])\n",
    "\n",
    "y_train = np.hstack([np.ones(len(human_cls_train_en)), np.zeros(len(opt_cls_train_en))])\n",
    "y_valid = np.hstack([np.ones(len(human_cls_dev_en)), np.zeros(len(opt_cls_dev_en))])\n",
    "\n",
    "cls = LogisticRegression(penalty='l1', solver='liblinear',  C=100, max_iter=1000).fit(X_train, y_train)\n",
    "\n",
    "print('Train acc.: ', cls.score(X_train, y_train))\n",
    "print('Validation acc.: ', cls.score(X_valid, y_valid))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4ad81d56",
   "metadata": {},
   "outputs": [],
   "source": [
    "X_test = np.vstack([human_cls_test_en, opt_cls_test_en])\n",
    "y_test = np.hstack([np.ones(len(human_cls_test_en)), np.zeros(len(opt_cls_test_en))])\n",
    "\n",
    "print('Test acc.: ', cls.score(X_test, y_test))"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "e574b0fa",
   "metadata": {},
   "source": [
    "### MLE"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "id": "057bf6e2",
   "metadata": {},
   "outputs": [],
   "source": [
    "sample_text = \"Speaking of festivities, there is one day in China that stands unrivaled - the first day of the Lunar New Year, commonly referred to as the Spring Festival. Even if you're generally uninterested in celebratory events, it's hard to resist the allure of the family reunion dinner, a quintessential aspect of the Spring Festival. Throughout the meal, family members raise their glasses to toast one another, expressing wishes for happiness, peace, health, and prosperity in the upcoming year.\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "id": "cb7cd3a0",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "MLE estimation of the Intrinsic dimension of sample text is  11.54934897385235\n"
     ]
    }
   ],
   "source": [
    "print(\"MLE estimation of the Intrinsic dimension of sample text is \", get_mle_single(sample_text))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e73041ba",
   "metadata": {},
   "outputs": [],
   "source": [
    "reddit_data = pd.read_json(\".../Datasets/opt_13b_reddit.jsonl_pp\", lines=True)\n",
    "\n",
    "#Training subset\n",
    "human_mle_train_en = get_mle(reddit_data.iloc[train_idx], 'gold_completion')\n",
    "opt_mle_train_en = get_mle(reddit_data.iloc[train_idx], 'gen_completion',is_list=True)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "98dc6ec5",
   "metadata": {},
   "source": [
    "\n",
    "### PHD"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "id": "612be5bf",
   "metadata": {},
   "outputs": [],
   "source": [
    "sample_text = \"Speaking of festivities, there is one day in China that stands unrivaled - the first day of the Lunar New Year, commonly referred to as the Spring Festival. Even if you're generally uninterested in celebratory events, it's hard to resist the allure of the family reunion dinner, a quintessential aspect of the Spring Festival. Throughout the meal, family members raise their glasses to toast one another, expressing wishes for happiness, peace, health, and prosperity in the upcoming year.\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "id": "889a7d89",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "PHD estimation of the Intrinsic dimension of sample text is  9.328150478663451\n"
     ]
    }
   ],
   "source": [
    "print(\"PHD estimation of the Intrinsic dimension of sample text is \", get_phd_single(sample_text))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "239bced6",
   "metadata": {},
   "outputs": [],
   "source": [
    "#Training subset\n",
    "\n",
    "reddit_data = pd.read_json(\".../Datasets/opt_13b_reddit.jsonl_pp\", lines=True)\n",
    "\n",
    "human_phd_train_en = get_phd(reddit_data.iloc[train_idx], 'gold_completion')\n",
    "opt_phd_train_en = get_phd(reddit_data.iloc[train_idx], 'gen_completion',is_list=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4e8209aa",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
