{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Fairness LLM"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "For detailed documentation, including API references, tutorials, and best practices, please visit our comprehensive documentation site:\n",
    "\n",
    "[TrustEval Documentation](https://trustgen.github.io/trustgen_docs/)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import sys\n",
    "import nest_asyncio\n",
    "nest_asyncio.apply()\n",
    "\n",
    "## conmment this line if you don't need to use proxy\n",
    "os.environ['http_proxy'] = 'http://127.0.0.1:7890' \n",
    "os.environ['https_proxy'] = 'http://127.0.0.1:7890'  \n",
    "\n",
    "parent_dir = os.path.dirname(os.getcwd())\n",
    "sys.path.append(parent_dir)\n",
    "\n",
    "base_dir = os.path.abspath(\"./test/fairness_llm/\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Metadata Curator\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Created directory: d:\\paper\\TrustEval-toolkit\\examples\\test\\fairness_llm\n",
      "Downloading dataset for section: fairness_llm\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Downloading...\n",
      "From: https://drive.google.com/uc?id=1tgJcwTOtC-1B7KKB_SrOzJhaTAULzM9u\n",
      "To: d:\\paper\\TrustEval-toolkit\\examples\\test\\fairness_llm\\tmp.zip\n",
      "100%|██████████| 2.53M/2.53M [00:00<00:00, 2.86MB/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Extracting dataset to: d:\\paper\\TrustEval-toolkit\\examples\\test\\fairness_llm\n",
      "Removing temporary zip file: d:\\paper\\TrustEval-toolkit\\examples\\test\\fairness_llm\\tmp.zip\n",
      "\u001b[92mDataset for section 'fairness_llm' has been downloaded and extracted to 'd:\\paper\\TrustEval-toolkit\\examples\\test\\fairness_llm'\u001b[0m\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    }
   ],
   "source": [
    "from trusteval.src.download import download_metadata\n",
    "download_metadata('fairness_llm', base_dir)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Generating 300 prompts with 1 files\n",
      "Complexities: range(3, 9)\n",
      "Attributes: range(0, 6)\n",
      "d:\\paper\\TrustEval-toolkit\\trusteval\n",
      "Running StereotypeGenerator ...\n",
      "Step 1: Processing all original datasets...\n",
      "metadata\\stereotype_data\\processed\\crows.json have successful generate.\n",
      "metadata\\stereotype_data\\processed\\stereoset_fill.json have successful generate.\n",
      "metadata\\stereotype_data\\processed\\stereoset_complete.json have successful generate.\n",
      "metadata\\stereotype_data\\processed\\bbq.json have successful generate.\n",
      "Step 2: Randomly sampling from processed datasets...\n",
      "metadata\\stereotype_data\\select\\crows_sample.json have successful generate. 40 samples\n",
      "metadata\\stereotype_data\\select\\bbq_sample.json have successful generate. 40 samples\n",
      "metadata\\stereotype_data\\select\\stereoset_fill_sample.json have successful generate. 30 samples\n",
      "metadata\\stereotype_data\\select\\stereoset_complete_sample.json have successful generate. 30 samples\n",
      "Step 3: Generating cases...\n",
      "Crows cases have been generated.\n",
      "successful read d:\\paper\\TrustEval-toolkit\\examples\\test\\fairness_llm\\metadata\\stereotype_data\\select\\stereoset_fill_sample.json\n",
      "Fill cases have been generated and saved to d:\\paper\\TrustEval-toolkit\\examples\\test\\fairness_llm\\metadata\\stereotype_data\\cases\\stereoset_fill_cases.json.\n",
      "successful read d:\\paper\\TrustEval-toolkit\\examples\\test\\fairness_llm\\metadata\\stereotype_data\\select\\stereoset_complete_sample.json\n",
      "Complete cases have been generated and saved to d:\\paper\\TrustEval-toolkit\\examples\\test\\fairness_llm\\metadata\\stereotype_data\\cases\\stereoset_complete_cases.json.\n",
      "successful read d:\\paper\\TrustEval-toolkit\\examples\\test\\fairness_llm\\metadata\\stereotype_data\\select\\bbq_sample.json\n",
      "BBQ cases have been generated and saved.\n",
      "Step 4: Merging and reordering all cases...\n",
      "Merged and reordered data saved to LLM_fairness_stereotype.json\n",
      "Total number of cases: 140\n",
      "\u001b[92mAll dataset generation finished.\u001b[0m\n"
     ]
    }
   ],
   "source": [
    "from trusteval.dimension.fairness.fairness_llm import pipeline\n",
    "pipeline.run(base_dir=base_dir,subset=['stereotype','preference', 'disparagement'])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Contextual Variator"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Successfully copied file_config to d:\\paper\\TrustEval-toolkit\\examples\\test\\fairness_llm\\file_config.json\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 140/140 [01:04<00:00,  2.17it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Enhanced data saved to d:\\paper\\TrustEval-toolkit\\examples\\test\\fairness_llm\\LLM_fairness_stereotype_enhanced.json\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 211/211 [01:45<00:00,  1.99it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Enhanced data saved to d:\\paper\\TrustEval-toolkit\\examples\\test\\fairness_llm\\LLM_fairness_disparagement_enhanced.json\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 15/15 [00:09<00:00,  1.56it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Enhanced data saved to d:\\paper\\TrustEval-toolkit\\examples\\test\\fairness_llm\\LLM_fairness_preference_enhanced.json\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    }
   ],
   "source": [
    "from trusteval import contextual_variator_cli\n",
    "import shutil\n",
    "\n",
    "source_config = os.path.join(parent_dir,\"trusteval\",\"dimension\",\"fairness\",'fairness_llm','file_config_fairness.json')\n",
    "target_config = os.path.join(base_dir,\"file_config.json\")\n",
    "if os.path.exists(source_config):\n",
    "    shutil.copy2(source_config, target_config)\n",
    "    print(f\"Successfully copied file_config to {target_config}\")\n",
    "else:\n",
    "    print(\"Warning: Source file_config not found\")\n",
    "\n",
    "contextual_variator_cli(\n",
    "    dataset_folder=base_dir\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Response Generator"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Processing file: LLM_fairness_stereotype.json -> d:\\paper\\TrustEval-toolkit\\examples\\test\\fairness_llm\\LLM_fairness_stereotype_enhanced.json\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Processing items: 100%|██████████| 140/140 [01:12<00:00,  1.92it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Processing file: LLM_fairness_disparagement.json -> d:\\paper\\TrustEval-toolkit\\examples\\test\\fairness_llm\\LLM_fairness_disparagement_enhanced.json\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Processing items: 100%|██████████| 211/211 [00:00<00:00, 11767.81it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Processing file: LLM_fairness_preference.json -> d:\\paper\\TrustEval-toolkit\\examples\\test\\fairness_llm\\LLM_fairness_preference_enhanced.json\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Processing items: 100%|██████████| 15/15 [00:00<?, ?it/s]\n"
     ]
    }
   ],
   "source": [
    "from trusteval import generate_responses\n",
    "\n",
    "request_type = ['llm']\n",
    "async_list = ['gpt-4o']\n",
    "await generate_responses(\n",
    "    data_folder=base_dir,\n",
    "    request_type=request_type,\n",
    "    async_list=async_list\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Judge Processor"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Processing file: LLM_fairness_stereotype.json with task type: fairness_stereotype_groundtruth\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 140/140 [01:06<00:00,  2.12it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Results saved to d:\\paper\\TrustEval-toolkit\\examples\\test\\fairness_llm\\LLM_fairness_stereotype_enhanced_responses_judge.json\n",
      "Processing file: LLM_fairness_disparagement.json with task type: fairness_disparagement_open_ended\n",
      "File d:\\paper\\TrustEval-toolkit\\examples\\test\\fairness_llm\\LLM_fairness_disparagement_enhanced_responses_judge.json already exists. Loading existing responses.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 211/211 [00:00<00:00, 49926.56it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Results saved to d:\\paper\\TrustEval-toolkit\\examples\\test\\fairness_llm\\LLM_fairness_disparagement_enhanced_responses_judge.json\n",
      "Processing file: LLM_fairness_preference.json with task type: fairness_preference_open_ended\n",
      "File d:\\paper\\TrustEval-toolkit\\examples\\test\\fairness_llm\\LLM_fairness_preference_enhanced_responses_judge.json already exists. Loading existing responses.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 15/15 [00:00<00:00, 15105.54it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Results saved to d:\\paper\\TrustEval-toolkit\\examples\\test\\fairness_llm\\LLM_fairness_preference_enhanced_responses_judge.json\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    }
   ],
   "source": [
    "from trusteval.src.evaluation import judge_responses\n",
    "target_models = async_list\n",
    "judge_type = 'llm'\n",
    "judge_model = ['gpt-4o-mini']\n",
    "config_path = os.path.join(parent_dir, 'trusteval/src/config/judge_prompt.yaml')\n",
    "\n",
    "await judge_responses(\n",
    "    data_folder=base_dir,\n",
    "    async_judge_model=judge_model,\n",
    "    target_models=target_models,\n",
    "    judge_type=judge_type,\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Metrics"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Metrics successfully exported to d:\\paper\\TrustEval-toolkit\\examples\\test\\fairness_llm\\fairness_llm_metrics.csv\n"
     ]
    }
   ],
   "source": [
    "from trusteval.src.evaluation.lm_evaluator import metric_generation\n",
    "async_list = ['gpt-4o']\n",
    "\n",
    "metric_generation(\n",
    "    base_dir=base_dir,\n",
    "    aspect=\"fairness_llm\",\n",
    "    model_list=async_list\n",
    ")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "api",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.14"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
