{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "c8ec4a03",
   "metadata": {},
   "outputs": [],
   "source": [
    "import csv\n",
    "import re\n",
    "import pandas\n",
    "import random"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "291b363b",
   "metadata": {},
   "outputs": [],
   "source": [
    "# A single M/D/YYYY date: 1-2 digit month and day, 4 digit year.\n",
    "date_subpatn = r'[0-9]{1,2}/[0-9]{1,2}/[0-9]{4}'\n",
    "date_subpatn_comp = re.compile(date_subpatn)\n",
    "# A line of any text ending in '[date]' or '[date - date]'. Raw strings stop the\n",
    "# regex escapes from being read as (invalid) Python string escapes, which newer\n",
    "# Python versions warn about.\n",
    "date_patn = re.compile(rf'^[\\S\\s]*\\[{date_subpatn}( - {date_subpatn})?\\]$')\n",
    "\n",
    "# Answer line: starts with up to three digits and '%', or with '*'.\n",
    "ansr_patn = re.compile(r'^([0-9]{0,3}%|[*])')\n",
    "\n",
    "# 'Citation: ' line.\n",
    "citn_patn = re.compile(r'^Citation: ')\n",
    "\n",
    "# 'Question: [<id>]' line; the id is word characters and dots.\n",
    "qnid_patn = re.compile(r'^Question: \\[[\\w.]+\\]')\n",
    "\n",
    "# 'Study: [Roper #<id>]' line; the id is digits and dots.\n",
    "stid_patn = re.compile(r'^Study: \\[Roper #[0-9.]+\\]')\n",
    "\n",
    "# 'Sample: ' line.\n",
    "demo_patn = re.compile(r'^Sample: ')\n",
    "\n",
    "# 'Sample Size: <digits>' line.\n",
    "resn_patn = re.compile(r'^Sample Size: [0-9]+')\n",
    "\n",
    "# A line of exactly 60 hyphens.\n",
    "strt_patn = re.compile(r'^-{60}$')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "117ebb73",
   "metadata": {},
   "outputs": [],
   "source": [
    "def parse_date(raw_line):\n",
    "    \"\"\"Return the last M/D/YYYY date string found in `raw_line`.\n",
    "\n",
    "    For a line holding two dates this is the second one. Raises IndexError\n",
    "    when the line holds no date at all.\n",
    "    \"\"\"\n",
    "    found_dates = date_subpatn_comp.findall(raw_line.strip())\n",
    "    return found_dates[-1]\n",
    "\n",
    "def parse_ques(raw_line):\n",
    "    \"\"\"Return the question text: `raw_line` without surrounding whitespace.\"\"\"\n",
    "    question_text = raw_line.strip()\n",
    "    return question_text\n",
    "\n",
    "def parse_ansr(raw_lines):\n",
    "    \"\"\"Parse the answer lines of one record into texts and integer percentages.\n",
    "\n",
    "    An answer starts on a line matching `ansr_patn`: a ratio token ('##%', '#%',\n",
    "    a bare '%', or '*') followed by the answer text. A bare '%' and '*' are both\n",
    "    read as 0. Any other non-blank line continues the previous answer (answers\n",
    "    may wrap across lines); blank lines are ignored.\n",
    "\n",
    "    Args:\n",
    "        raw_lines: iterable of raw text lines making up the answer block.\n",
    "\n",
    "    Returns:\n",
    "        (answer_txts, ratios): parallel lists of answer strings and int ratios.\n",
    "\n",
    "    Raises:\n",
    "        ValueError: if non-blank text appears before the first ratio line.\n",
    "    \"\"\"\n",
    "    ratios, text_parts = [], []\n",
    "\n",
    "    for ln_raw in raw_lines:\n",
    "        stripped = ln_raw.strip()\n",
    "        found = re.match(ansr_patn, stripped)\n",
    "        if found:\n",
    "            # The token is '*' or optional digits plus '%'; no digits means 0.\n",
    "            digits = found.group(0).rstrip('%*')\n",
    "            ratios.append(int(digits) if digits else 0)\n",
    "            # Slice after the token instead of splitting on whitespace so a ratio\n",
    "            # with no text, or text glued to the token, still parses.\n",
    "            text_parts.append([stripped[found.end():].strip()])\n",
    "        elif not stripped:\n",
    "            continue\n",
    "        elif not text_parts:\n",
    "            raise ValueError(f'Error: could not parse ANSR. Text before first answer ratio - {stripped}')\n",
    "        else:\n",
    "            # Join the stripped continuation so wrapped answers do not pick up\n",
    "            # the continuation line's indentation as internal whitespace.\n",
    "            text_parts[-1].append(stripped)\n",
    "\n",
    "    answer_txts = [' '.join(part for part in parts if part) for parts in text_parts]\n",
    "    return answer_txts, ratios\n",
    "\n",
    "def parse_qnid(raw_line):\n",
    "    \"\"\"Return the id inside a 'Question: [<id>]' line.\n",
    "\n",
    "    Raises ValueError if the stripped line lacks that prefix or the closing ']'.\n",
    "    \"\"\"\n",
    "    prefix, suffix = 'Question: [', ']'\n",
    "    stripped = raw_line.strip()\n",
    "    if stripped.startswith(prefix) and stripped.endswith(suffix):\n",
    "        return stripped[len(prefix):-len(suffix)]\n",
    "    raise ValueError(f'Error: could not parse QNID. stripped raw line - {stripped}')\n",
    "\n",
    "def parse_stid(raw_line):\n",
    "    \"\"\"Return the id inside a 'Study: [Roper #<id>]' line.\n",
    "\n",
    "    Raises ValueError if the stripped line lacks that prefix or the closing ']'.\n",
    "    \"\"\"\n",
    "    prefix, suffix = 'Study: [Roper #', ']'\n",
    "    stripped = raw_line.strip()\n",
    "    if stripped.startswith(prefix) and stripped.endswith(suffix):\n",
    "        return stripped[len(prefix):-len(suffix)]\n",
    "    raise ValueError(f'Error: could not parse STID. stripped raw line - {stripped}')\n",
    "\n",
    "def parse_citn(raw_line):\n",
    "    \"\"\"Return the citation text after 'Citation: ', cut at the first '.'.\n",
    "\n",
    "    Raises ValueError if the stripped line does not start with 'Citation: '.\n",
    "    \"\"\"\n",
    "    prefix = 'Citation: '\n",
    "    stripped = raw_line.strip()\n",
    "    if stripped.startswith(prefix):\n",
    "        citation, _, _ = stripped[len(prefix):].partition('.')\n",
    "        return citation\n",
    "    raise ValueError(f'Error: could not parse CITN. stripped raw line - {stripped}')\n",
    "\n",
    "def parse_demo(raw_line):\n",
    "    \"\"\"Return the text following 'Sample: ' on a sample line.\n",
    "\n",
    "    Raises ValueError if the stripped line does not start with 'Sample: '.\n",
    "    \"\"\"\n",
    "    prefix = 'Sample: '\n",
    "    stripped = raw_line.strip()\n",
    "    if stripped.startswith(prefix):\n",
    "        return stripped[len(prefix):]\n",
    "    raise ValueError(f'Error: could not parse DEMO. stripped raw line - {stripped}')\n",
    "\n",
    "def parse_resn(raw_line):\n",
    "    \"\"\"Return the integer sample size from a 'Sample Size: <n>' line.\n",
    "\n",
    "    Args:\n",
    "        raw_line: the raw text line; surrounding whitespace is ignored.\n",
    "\n",
    "    Returns:\n",
    "        The sample size as an int.\n",
    "\n",
    "    Raises:\n",
    "        ValueError: if the line does not start with 'Sample Size: ' or the\n",
    "            remainder cannot be converted by int().\n",
    "    \"\"\"\n",
    "    stripped = raw_line.strip()\n",
    "    if not stripped.startswith('Sample Size: '):\n",
    "        raise ValueError(f'Error: could not parse RESN. stripped raw line - {stripped}')\n",
    "    smp_size = stripped[13:]\n",
    "    try:\n",
    "        return int(smp_size)\n",
    "    except ValueError as err:\n",
    "        # Catch only the conversion failure (a bare except would also trap\n",
    "        # KeyboardInterrupt/SystemExit) and keep the original cause attached.\n",
    "        raise ValueError(f'Error: could not convert sample count to numeric. As str - {smp_size}') from err\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "72931742",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Every phrase seen in the dataset that clearly marks an answer option as a\n",
    "# non-response. Noncommittal but genuine responses (e.g. \"unsure\" or\n",
    "# \"don't know\") are deliberately not listed here.\n",
    "refusal_phrases = [\n",
    "    \"don't know/refused\",\n",
    "    \"don't know/skipped\",\n",
    "    \"don't know/skippedrefused\",\n",
    "    \"no answer\",\n",
    "    \"not selected\",\n",
    "    \"not selected/no answer\",\n",
    "    \"not sure/refused\",\n",
    "    \"not sure/skipped\",\n",
    "    \"omit\",\n",
    "    \"refused\",\n",
    "    \"refused/web blank\",\n",
    "    \"skip\",\n",
    "    \"skipped\",\n",
    "    \"skipped on web\",\n",
    "    \"skipped/refused\",\n",
    "    \"skipped/web blank\",\n",
    "    \"web blank\",\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "3afe7fc3",
   "metadata": {},
   "outputs": [],
   "source": [
    "def _find_section(lines, start, patn, label):\n",
    "    \"\"\"Return the index of the first line at or after `start` matching `patn`.\n",
    "\n",
    "    Args:\n",
    "        lines: all lines of the input file.\n",
    "        start: index at which scanning begins.\n",
    "        patn: pattern identifying the wanted section line.\n",
    "        label: short section name used in the error message.\n",
    "\n",
    "    Raises:\n",
    "        ValueError: if a line matching `strt_patn` or the end of the file is\n",
    "            reached before `patn` matches.\n",
    "    \"\"\"\n",
    "    idx = start\n",
    "    while idx < len(lines):\n",
    "        if re.match(patn, lines[idx]):\n",
    "            return idx\n",
    "        if re.match(strt_patn, lines[idx]):\n",
    "            raise ValueError(f'Unexpected next question at line {idx + 1}. Expected: {label}')\n",
    "        idx += 1\n",
    "    raise ValueError(f'Unexpected end of file at line {idx}. Expected: {label}')\n",
    "\n",
    "\n",
    "def parse(fn, shuffle_answers=True):\n",
    "    \"\"\"Parse an iRoper text file into a DataFrame with one row per kept question.\n",
    "\n",
    "    Each record is located by its date line; the answer, citation, question id,\n",
    "    study id, sample and sample-size lines are then searched for in that order.\n",
    "    A question is dropped when its answer percentages do not sum to roughly 100\n",
    "    or when at most one answer is left after removing refusal-type options. The\n",
    "    number of questions dropped for each reason is printed.\n",
    "\n",
    "    Args:\n",
    "        fn: path of the UTF-8 text file to parse.\n",
    "        shuffle_answers: if True, randomly reorder each question's answers (and\n",
    "            their ratios, in step) using the global `random` module state.\n",
    "\n",
    "    Returns:\n",
    "        pandas.DataFrame with the columns listed in `field_names`.\n",
    "\n",
    "    Raises:\n",
    "        ValueError: if a record is malformed (blank or missing question line, or\n",
    "            a section missing before the next separator or the end of the file).\n",
    "    \"\"\"\n",
    "    field_names = [\n",
    "        'Date',\n",
    "        'Question',\n",
    "        'Answers',\n",
    "        'Answer_Ratios',\n",
    "        'Citation',\n",
    "        'Question_Id',\n",
    "        'Study_Id',\n",
    "        'Demographic',\n",
    "        'Response_Count',\n",
    "    ]\n",
    "\n",
    "    data_dict = {field_nm : [] for field_nm in field_names}\n",
    "\n",
    "    excluded_ref_len = 0\n",
    "    excluded_ans_count = 0\n",
    "\n",
    "    # Read everything first so the file is closed before parsing begins.\n",
    "    with open(fn, encoding='utf-8') as data_file:\n",
    "        lines = data_file.readlines()\n",
    "    num_lines = len(lines)\n",
    "    pointer = 2 # skip first two lines, always iRoper metaheader and blank line\n",
    "\n",
    "    while pointer < num_lines:\n",
    "        # Advance to the next date line. Stop cleanly when only trailing lines\n",
    "        # without a further record remain, instead of indexing past the end.\n",
    "        while pointer < num_lines and not re.match(date_patn, lines[pointer]):\n",
    "            pointer += 1\n",
    "        if pointer >= num_lines:\n",
    "            break\n",
    "        date_loc = pointer\n",
    "\n",
    "        # The question text is read from two lines below the date line.\n",
    "        ques_loc = date_loc + 2\n",
    "        if ques_loc >= num_lines or lines[ques_loc].strip() == '':\n",
    "            raise ValueError(f'Unexpected format(ANSR) at line {date_loc+1}.\\n{lines[date_loc]}')\n",
    "\n",
    "        # Each section is searched for starting where the previous one was found.\n",
    "        ansr_loc = _find_section(lines, ques_loc, ansr_patn, 'ANSR')\n",
    "        citn_loc = _find_section(lines, ansr_loc, citn_patn, 'CITN')\n",
    "        qnid_loc = _find_section(lines, citn_loc, qnid_patn, 'QNID')\n",
    "        stid_loc = _find_section(lines, qnid_loc, stid_patn, 'STID')\n",
    "        demo_loc = _find_section(lines, stid_loc, demo_patn, 'DEMO')\n",
    "        resn_loc = _find_section(lines, demo_loc, resn_patn, 'RESN')\n",
    "\n",
    "        # Move to the next separator (or the end of the file) before any\n",
    "        # `continue` below, so the outer loop always makes progress.\n",
    "        pointer = resn_loc\n",
    "        while pointer < num_lines and not re.match(strt_patn, lines[pointer]):\n",
    "            pointer += 1\n",
    "\n",
    "        # Extract values\n",
    "        date = parse_date(lines[date_loc])\n",
    "        ques = parse_ques(lines[ques_loc])\n",
    "        # NOTE(review): the line directly above the citation is left out of the\n",
    "        # answer block - presumably always a blank separator; confirm on the data.\n",
    "        ansr, rati = parse_ansr(lines[ansr_loc:citn_loc-1])\n",
    "        citn = parse_citn(lines[citn_loc])\n",
    "        qnid = parse_qnid(lines[qnid_loc])\n",
    "        stid = parse_stid(lines[stid_loc])\n",
    "        demo = parse_demo(lines[demo_loc])\n",
    "        resn = parse_resn(lines[resn_loc])\n",
    "\n",
    "        # Rough way to exclude multi-select questions where possible. Total should add up to\n",
    "        # 100, with some allowance for over/under summing from rounding errors. Rounding errors\n",
    "        # can never explain variation from 100 greater than the number of choices. Will likely\n",
    "        # have missed some questions where the total adds up to ~100 by happenstance. This is\n",
    "        # likely a necessary evil and unfixable. Should be safe to ignore.\n",
    "        total = sum(rati)\n",
    "        if total <= 100 - len(rati) or total >= 100 + len(rati):\n",
    "            excluded_ans_count += 1\n",
    "            continue\n",
    "\n",
    "        # Additional step: remove answer choices that are identifiably just some variation of\n",
    "        # refused or failed to answer. Also remove questions with only one or no available\n",
    "        # answer (including ones that had more than 1 answers before filtering).\n",
    "        kept = [(a, r) for a, r in zip(ansr, rati) if a.lower() not in refusal_phrases]\n",
    "\n",
    "        if len(kept) <= 1:\n",
    "            excluded_ref_len += 1\n",
    "            continue\n",
    "\n",
    "        if shuffle_answers:\n",
    "            # Shuffle answers to control for positional biases across questions with similar answer options\n",
    "            kept = random.sample(kept, k=len(kept))\n",
    "\n",
    "        ansr = [a for a, _ in kept]\n",
    "        rati = [r for _, r in kept]\n",
    "\n",
    "        data_dict['Date'].append(date)\n",
    "        data_dict['Question'].append(ques)\n",
    "        data_dict['Answers'].append(ansr)\n",
    "        data_dict['Answer_Ratios'].append(rati)\n",
    "        data_dict['Citation'].append(citn)\n",
    "        data_dict['Question_Id'].append(qnid)\n",
    "        data_dict['Study_Id'].append(stid)\n",
    "        data_dict['Demographic'].append(demo)\n",
    "        data_dict['Response_Count'].append(resn)\n",
    "\n",
    "    print(f'{excluded_ans_count} rows removed for invalid answer ratios')\n",
    "    print(f'{excluded_ref_len} rows removed for too few non-refusal options')\n",
    "\n",
    "    df = pandas.DataFrame.from_dict(data_dict)\n",
    "    return df\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ce091ae6",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "572 rows removed for invalid answer ratios\n",
      "332 rows removed for too few non-refusal options\n"
     ]
    }
   ],
   "source": [
    "input_filename = 'iroper_dl.txt'\n",
    "\n",
    "output_fn_csv = 'iroper_data.csv'\n",
    "output_fn_pkl = 'iroper_data.pkl'\n",
    "\n",
    "sample = True\n",
    "sample_n = 3000\n",
    "shuffle_answers = True\n",
    "random_seed = 0  # fixed so a fresh Restart & Run All writes the same output files\n",
    "\n",
    "# parse() shuffles answers through the global `random` module, so seed it first.\n",
    "random.seed(random_seed)\n",
    "df = parse(input_filename, shuffle_answers=shuffle_answers)\n",
    "\n",
    "if sample:\n",
    "    # Cap the request at the number of parsed rows: sampling without replacement\n",
    "    # raises ValueError when asked for more rows than the frame has.\n",
    "    df = df.sample(min(sample_n, len(df)), ignore_index=True, random_state=random_seed)\n",
    "\n",
    "df.to_csv(output_fn_csv)\n",
    "df.to_pickle(output_fn_pkl)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e1515f3f",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Show the parsed (and, if enabled, sampled) DataFrame as this cell's output.\n",
    "df"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
