{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "The autoreload extension is already loaded. To reload it, use:\n",
      "  %reload_ext autoreload\n"
     ]
    }
   ],
   "source": [
    "import ast\n",
    "\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "\n",
    "from utils import *\n",
    "\n",
    "%load_ext autoreload  \n",
    "%autoreload 2 \n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Data preprocessing\n",
    "\n",
    "> Uses functions from `utils.py` to clean text and extract categories from (potentially recursive) Label Studio-formatted data."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Read the annotated-news CSV; the relative path is resolved against the kernel's working directory.\n",
    "data = pd.read_csv(filepath_or_buffer='annotated_news_with_metrics.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Remove the 'generated_article_' substring from every column name so metrics have short names.\n",
    "data.columns = data.columns.str.replace('generated_article_', '', regex=False)\n",
    "\n",
    "# Each 'info_density' cell holds the string form of a dict. Parse it once per row with\n",
    "# ast.literal_eval, which accepts Python literals only, instead of eval, which would execute\n",
    "# any expression found in the CSV (and was previously run three times per row).\n",
    "# NOTE(review): literal_eval raises ValueError on non-literal content such as bare `nan` or\n",
    "# `np.float64(...)` -- confirm the CSV stores plain numeric literals.\n",
    "info_density_parsed = data['info_density'].apply(ast.literal_eval)\n",
    "\n",
    "# Expand the three surprisal statistics into flat columns named info_density_<stat>.\n",
    "for stat in ('cv_surprisal', 'std_surprisal', 'mean_surprisal'):\n",
    "    data[f'info_density_{stat}'] = info_density_parsed.apply(lambda parsed, key=stat: parsed[key])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Run extract_codes (presumably provided by `from utils import *`) over each slop-code column.\n",
    "# Bracket indexing replaces attribute-style assignment, which pandas documents as fragile: it\n",
    "# only targets a column when one with that name already exists.\n",
    "# NOTE: the columns are overwritten in place, so re-running this cell feeds already-converted\n",
    "# values back into extract_codes -- reload `data` before re-running.\n",
    "for code_col in ('slop_code_sq', 'slop_code_iq', 'slop_code_iu'):\n",
    "    data[code_col] = data[code_col].apply(extract_codes)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Binarise the annotation: 1 when the value is 'yes' (case-insensitive), otherwise 0.\n",
    "# The vectorised .str accessor leaves missing values as NaN, which compare unequal to 'yes' and\n",
    "# so become 0, instead of raising AttributeError as `x.lower()` does on a float NaN.\n",
    "# Re-running on the already-converted integer column still fails loudly (.str needs strings).\n",
    "data['is_slop'] = data['is_slop'].str.lower().eq('yes').astype(int)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# automatic metric feature columns\n",
    "feature_cols = [\n",
    "    'bias', \n",
    "    'cr', \n",
    "    'cr_pos', \n",
    "    'template_rate', \n",
    "    'templates_per_token',\n",
    "    'sentence_length', \n",
    "    'word_length', \n",
    "    'num_words', \n",
    "    'num_sentences', \n",
    "    'fkgl', \n",
    "    'fre',\n",
    "    'gfi', \n",
    "    'prp_info_density', \n",
    "    'info_density_cv_surprisal', \n",
    "    'info_density_std_surprisal',\n",
    "    'info_density_mean_surprisal'\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# human annotation columns\n",
    "LABELS = [\n",
    "    'IQ1: Factuality', \n",
    "    'IQ2: Bias',\n",
    "    'IU1: Information Density',       \n",
    "    'IU2: Information Relevance',\n",
    "    'SQ1: Repetition', \n",
    "    'SQ2: Templatedness', \n",
    "    'SQ3: Coherence',\n",
    "    'SQ4: Fluency',              \n",
    "    'SQ5: Verbosity', \n",
    "    'SQ6: Word Complexity', \n",
    "    'SQ7: Tone',\n",
    "]\n",
    "\n",
    "# collapsed labels (some redundant codes from adjudication)\n",
    "collapse_map = {\n",
    "        \"Density\":   [\"IU1: Information Density\"],\n",
    "        \"Relevance\": [\"IU2: Information Relevance\"],\n",
    "        \"Factuality\": [\"IQ1: Factuality\"],\n",
    "        \"Bias\":       [\"IQ2: Bias\"],\n",
    "        \"Structure\":  [\"SQ1: Repetition\", \"SQ2: Templatedness\"],\n",
    "        \"Coherence\":  [\"SQ3: Coherence\"],\n",
    "        \"Tone\":       [\"SQ4: Fluency\", \"SQ5: Verbosity\", \"SQ6: Word Complexity\", \"SQ7: Tone\"],\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "base",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
