{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f9d4e42f",
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "\n",
    "# with open(\"/home/liangrenzhao/Data_Contamination/train/omni_math/original/result/Qwen2.5-7B-Instruct_original_logprob.json\", \"r\") as f:\n",
    "#     ori_data = json.load(f)\n",
    "\n",
    "# Load the records analysed by the remaining cells. The path below is an empty\n",
    "# placeholder and must be filled in before this cell can run.\n",
    "# JSON text is UTF-8, so decode it explicitly instead of relying on the\n",
    "# platform's locale-dependent default encoding.\n",
    "with open(\"\", \"r\", encoding=\"utf-8\") as f:\n",
    "    sft_data = json.load(f)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "fc92fcf2",
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "\n",
    "# Pull the two log-probability payloads out of every record; both lists stay\n",
    "# index-aligned with sft_data.\n",
    "greedy_sample_probs = [instance['greedy_sample_prob'] for instance in sft_data]\n",
    "samples_probs = [instance['samples_prob'] for instance in sft_data]\n",
    "\n",
    "def get_logprob(list_dict):\n",
    "    \"\"\"Collect the 'logprob' field of every entry except the first.\n",
    "\n",
    "    Args:\n",
    "        list_dict: Sequence of dicts, each carrying a 'logprob' key.\n",
    "\n",
    "    Returns:\n",
    "        A NumPy array with one value per entry from index 1 onward; empty\n",
    "        when the sequence has fewer than two entries.\n",
    "    \"\"\"\n",
    "    # Entry 0 is never read; only entries from index 1 onward contribute.\n",
    "    return np.array([token_dict['logprob'] for token_dict in list_dict[1:]])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d524388c",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Display how many records were loaded.\n",
    "len(sft_data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "fb23f3fb",
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.metrics import roc_auc_score\n",
    "\n",
    "# Binary ground-truth vector for roc_auc_score: the first NUM_POSITIVE records\n",
    "# are labelled 1 and the rest 0 (assumes sft_data is ordered positives-first --\n",
    "# TODO confirm against the data file).\n",
    "NUM_POSITIVE = 150\n",
    "\n",
    "# Size the vector from the data rather than a hard-coded 356, so it cannot\n",
    "# drift out of sync with sft_data and break the metric with a length mismatch.\n",
    "Labels = np.zeros(len(sft_data))\n",
    "Labels[:NUM_POSITIVE] = 1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b0caf627",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Per-instance score: for every sampled generation, sum its 20 smallest token\n",
    "# log-probabilities and divide by the generation's entry count, then take the\n",
    "# standard deviation of that statistic across the instance's samples.\n",
    "# NOTE(review): the sum covers at most 20 tokens but is divided by len(lst),\n",
    "# not by the number of tokens summed -- confirm this normalisation is intended.\n",
    "min_samples_probs_temp1 = []\n",
    "\n",
    "for idx in range(len(samples_probs)):\n",
    "    idx_samples_probs_temp1 = samples_probs[idx]\n",
    "    min_samples_prob_temp1_list = [np.sum(sorted(get_logprob(lst))[:20])  / len(lst) for lst in idx_samples_probs_temp1]\n",
    "    min_samples_probs_temp1.append(np.std(min_samples_prob_temp1_list))\n",
    "\n",
    "\n",
    "roc_auc_score(Labels, np.array(min_samples_probs_temp1))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "199c12a0",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Spot-check the first instance's score; this must run after the scores are\n",
    "# computed, otherwise a fresh kernel raises NameError.\n",
    "min_samples_probs_temp1[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3eb3b33b",
   "metadata": {},
   "outputs": [],
   "source": [
    "def tokenize_code(sample, tokenizer, length=512):\n",
    "    \"\"\"Encode `sample` with `tokenizer`, optionally keeping only a prefix.\n",
    "\n",
    "    Args:\n",
    "        sample: Text handed to `tokenizer.encode`.\n",
    "        tokenizer: Object exposing an `encode(text)` method.\n",
    "        length: Maximum number of leading tokens to keep; any falsy value\n",
    "            (0, None) returns the full encoding.\n",
    "\n",
    "    Returns:\n",
    "        Whatever `tokenizer.encode` returns, sliced to `length` when set.\n",
    "    \"\"\"\n",
    "    token_ids = tokenizer.encode(sample)\n",
    "    if not length:\n",
    "        return token_ids\n",
    "    return token_ids[:length]\n",
    "\n",
    "def strip_code(sample):\n",
    "    \"\"\"Return the leading segment of a generation, cut at the first separator.\n",
    "\n",
    "    Surrounding whitespace is removed first. If the trimmed text contains\n",
    "    three consecutive newlines it is cut there; otherwise it is cut at the\n",
    "    first triple-backtick fence. Text with neither separator is returned\n",
    "    trimmed but otherwise unchanged.\n",
    "\n",
    "    Args:\n",
    "        sample: Raw generated text.\n",
    "\n",
    "    Returns:\n",
    "        The portion of the trimmed text before the chosen separator.\n",
    "    \"\"\"\n",
    "    text = sample.strip()\n",
    "    # Choose the separator from the trimmed text. Testing the raw sample let\n",
    "    # leading or trailing blank lines select the newline branch even though\n",
    "    # stripping had already removed them, which left the code fence uncut.\n",
    "    separator = '\\n\\n\\n' if '\\n\\n\\n' in text else '```'\n",
    "    return text.split(separator)[0]\n",
    "\n",
    "import json\n",
    "from transformers import AutoTokenizer\n",
    "from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score, roc_auc_score\n",
    "from Levenshtein import distance as levenshtein_distance\n",
    "\n",
    "def evaluate_classification(y_true, y_pred, y_pred_prob=None):\n",
    "    \"\"\"Compute binary-classification metrics for a set of predictions.\n",
    "\n",
    "    Args:\n",
    "        y_true: Ground-truth labels.\n",
    "        y_pred: Hard predictions, scored for precision, recall, accuracy and F1.\n",
    "        y_pred_prob: Optional scores; when given, ROC AUC is added.\n",
    "\n",
    "    Returns:\n",
    "        Dict with keys 'Precision', 'Recall', 'Accuracy' and 'F1 Score', plus\n",
    "        'AUC' when `y_pred_prob` is not None.\n",
    "    \"\"\"\n",
    "    scorers = (\n",
    "        ('Precision', precision_score),\n",
    "        ('Recall', recall_score),\n",
    "        ('Accuracy', accuracy_score),\n",
    "        ('F1 Score', f1_score),\n",
    "    )\n",
    "    report = {label: scorer(y_true, y_pred) for label, scorer in scorers}\n",
    "\n",
    "    # AUC needs scores rather than hard labels, so it is only added on request.\n",
    "    if y_pred_prob is None:\n",
    "        return report\n",
    "\n",
    "    report['AUC'] = roc_auc_score(y_true, y_pred_prob)\n",
    "    return report\n",
    "\n",
    "\n",
    "def get_edit_distance_distribution_star(samples, gready_sample, tokenizer, length = 512):\n",
    "    \"\"\"Measure token-level edit distance from the greedy output to each sample.\n",
    "\n",
    "    Every text is passed through `strip_code` and `tokenize_code` (truncated\n",
    "    to `length` tokens) before being compared.\n",
    "\n",
    "    Args:\n",
    "        samples: Iterable of sampled generations.\n",
    "        gready_sample: The greedy generation used as the reference.\n",
    "        tokenizer: Tokenizer forwarded to `tokenize_code`.\n",
    "        length: Token cap forwarded to `tokenize_code`.\n",
    "\n",
    "    Returns:\n",
    "        A tuple `(distances, max_length)`: one Levenshtein distance per sample,\n",
    "        and the longest token sequence seen among the reference and samples.\n",
    "    \"\"\"\n",
    "    reference_ids = tokenize_code(strip_code(gready_sample), tokenizer, length)\n",
    "    candidate_ids = [tokenize_code(strip_code(text), tokenizer, length) for text in samples]\n",
    "    distances = [levenshtein_distance(reference_ids, ids) for ids in candidate_ids]\n",
    "    # The reference length seeds the maximum so an empty sample list still works.\n",
    "    longest = max([len(reference_ids)] + [len(ids) for ids in candidate_ids])\n",
    "    return distances, longest\n",
    "\n",
    "def calculate_ratio(numbers, alpha=0.05):\n",
    "    \"\"\"Return the fraction of `numbers` that are less than or equal to `alpha`.\n",
    "\n",
    "    Args:\n",
    "        numbers: Sized iterable of comparable values (list or NumPy array).\n",
    "        alpha: Inclusive upper bound for a value to be counted.\n",
    "\n",
    "    Returns:\n",
    "        count / len(numbers), or 0 when `numbers` is empty.\n",
    "    \"\"\"\n",
    "    hits = 0\n",
    "    for value in numbers:\n",
    "        if value <= alpha:\n",
    "            hits += 1\n",
    "\n",
    "    # len() rather than truthiness: NumPy arrays reject a plain boolean test.\n",
    "    total = len(numbers)\n",
    "    if total <= 0:\n",
    "        return 0\n",
    "    return hits / total\n",
    "\n",
    "\n",
    "from tqdm import tqdm\n",
    "import numpy as np\n",
    "# Placeholder: set to the tokenizer's model name or local path before running.\n",
    "model_path = \"\"\n",
    "tokenizer = AutoTokenizer.from_pretrained(model_path)\n",
    "\n",
    "alpha = 0.05  # edit-distance tolerance, as a fraction of the longest token sequence\n",
    "xi = 0.01     # a task is predicted positive when its peak ratio exceeds this\n",
    "Results=[]\n",
    "# NOTE: rebinds Labels, which an earlier cell also assigns; here it holds each task's 'label'.\n",
    "Labels = []\n",
    "stds = []\n",
    "# Hand tqdm the data itself: wrapped around enumerate() it cannot read a length\n",
    "# and therefore shows no total or ETA. The loop index was unused.\n",
    "for task in tqdm(sft_data):\n",
    "    dist, ml = get_edit_distance_distribution_star(task['samples'], task['greedy_sample'], tokenizer)\n",
    "    dist = np.array(dist)\n",
    "    stds.append(np.std(dist))\n",
    "    # Share of samples whose edit distance to the greedy output is at most alpha * ml.\n",
    "    peak = calculate_ratio(dist, alpha*ml)\n",
    "    Results.append(peak)\n",
    "    Labels.append(task['label'])\n",
    "\n",
    "# Hard predictions threshold the peak ratio at xi; the raw ratios double as AUC scores.\n",
    "metric = evaluate_classification(Labels, [peak > xi for peak in Results], Results)\n",
    "\n",
    "print(f'Accuracy = {metric[\"Accuracy\"]}')\n",
    "print(f'Precision = {metric[\"Precision\"]}')\n",
    "print(f'Recall = {metric[\"Recall\"]}')\n",
    "print(f'F1Score = {metric[\"F1 Score\"]}')\n",
    "print(f'AUC = {metric[\"AUC\"]}')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ee4944e8",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "CDD",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
