{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "e8629841",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import os,re\n",
    "import numpy as np"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "d0b63b78",
   "metadata": {},
   "outputs": [],
   "source": [
    "parsed = os.listdir('Data')\n",
    "parsed = [i for i in parsed if '2019' in i]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "521ffe9b",
   "metadata": {},
   "outputs": [],
   "source": [
    "codes = {\n",
    "    2011:{'MT':130,'CN':123,'CH':119,'LC':126},\n",
    "    2012:{'CN':153,'CH':154,'LC':155,'MT':156},\n",
    "    2013:{'CN':188,'CH':187,'LC':189,'MT':190},\n",
    "    2015:{'CN':277,'CH':273,'LC':280,'MT':284},\n",
    "    2016:{'CN':307,'CH':308,'LC':309,'MT':310},\n",
    "    2017:{'CN':407,'CH':408,'LC':409,'MT':410},\n",
    "    2018:{'CN':463,'CH':464,'LC':465,'MT':466},\n",
    "    2019:{'CN':519,'CH':520,'LC':521,'MT':522},\n",
    "    2020:{'CN':604,'CH':574,'LC':584,'MT':594},\n",
    "    2021:{'CN':916,'CH':886,'LC':896,'MT':906},\n",
    "    2022:{'CN':1092,'CH':1062,'LC':1072,'MT':1082},\n",
    "    2023:{'CN':0,'CH':0,'LC':0,'MT':0}\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4b4c9bfa",
   "metadata": {},
   "outputs": [],
   "source": [
    "#remove dumb parsing artifacts from 2023\n",
    "year='2023'\n",
    "base_path = \"./Data/\"\n",
    "enems = os.listdir(base_path)\n",
    "for file in enems:\n",
    "    if year in file:\n",
    "        df = pd.read_csv('Data/'+file)\n",
    "        for column in df:\n",
    "            df[column] = df[column].str.replace(r' -(?=[A-Za-z])','-',regex=True)\n",
    "            df[column] = df[column].str.replace(r'(?<=[A-Za-z])- ','-',regex=True)\n",
    "            df[column] = df[column].str.replace(r' -(?=[\\d])','-',regex=True)\n",
    "            df[column] = df[column].str.replace(r'(?<=[\\d])- ','-',regex=True)\n",
    "        #fix maths in the answers\n",
    "        for col in ['A','B','C','D','E','body']:\n",
    "            df[col] = df[col].apply(lambda text: re.sub(' ?,? ?[Ff]echa parêntese ?,?',')',re.sub(' ?,? ?[Aa]bre parêntese ?,? ?',' (',text)) if type(text)==str else text)\n",
    "        df.to_csv('Data/'+file,index=False)\n",
    "        display(df.iloc[-3]['body'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "ab87fd38",
   "metadata": {
    "scrolled": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "2019_PV_CAD_impresso_D1_CD9_laranja.csv\n",
      "95\n",
      "2019_PV_CAD_impresso_D2_CD11_laranja.csv\n",
      "90\n"
     ]
    }
   ],
   "source": [
    "dfs = {}\n",
    "\n",
    "\n",
    "def match_gab_to_df(df,gab):\n",
    "    answers=[]\n",
    "    languages=[]\n",
    "    used = set()\n",
    "    for line in df.to_dict(orient='records'):\n",
    "        if line['test']=='redação':\n",
    "            answers.append('')\n",
    "            languages.append('portuguese')\n",
    "            continue\n",
    "        possible_gabarito = gab[gab['question']==line['question_number']]\n",
    "        gabarito = possible_gabarito.iloc[0]\n",
    "        \n",
    "        if (str(gabarito['question'])+str(gabarito['test'])) in used:\n",
    "            gabarito = possible_gabarito.iloc[1]\n",
    "            \n",
    "        answers.append(gabarito['answer'])\n",
    "        languages.append(gabarito['test'] if not pd.isna(gabarito['test']) else 'portuguese')\n",
    "        used.add(str(gabarito['question'])+str(gabarito['test']))\n",
    "    return answers,languages\n",
    "\n",
    "\n",
    "for prova in parsed:\n",
    "    print(prova)\n",
    "    df = pd.read_csv('Data/'+prova)\n",
    "    print(len(df))\n",
    "    year = int(re.search(\"\\d\\d\\d\\d\",prova).group())\n",
    "    df['question'] = df['question'].apply(lambda question: question.upper())\n",
    "    df['year'] = year\n",
    "    df['question_number'] = df['question'].apply(lambda question: int(re.search(\"[\\d]+\",question).group()) if re.search(\"[\\d]+\",question) else np.nan)\n",
    "    df['test']='redação'\n",
    "    if 'D1_' in prova or 'ia1' in prova or 'DIA_1' in prova:\n",
    "        if year < 2017:\n",
    "            df.loc[df['question_number']<=45,'test'] = 'CH'\n",
    "            df.loc[df['question_number']>45,'test'] = 'CN'\n",
    "        else:\n",
    "            df.loc[df['question_number']<=45,'test'] = 'LC'\n",
    "            df.loc[df['question_number']>45,'test'] = 'CH'\n",
    "    else:\n",
    "        if year < 2017:\n",
    "            df.loc[df['question_number']<=135,'test'] = 'LC'\n",
    "            df.loc[df['question_number']>135,'test'] = 'MT'\n",
    "        else:\n",
    "            df.loc[df['question_number']<=135,'test'] = 'CN'\n",
    "            df.loc[df['question_number']>135,'test'] = 'MT'\n",
    "    #add gabarito data\n",
    "    if year != 2023:\n",
    "        gab = pd.read_csv('GabaritosCsv/'+prova.replace('CAD','GAB')).sort_values(by=['question','test'])\n",
    "    else:\n",
    "        gab = pd.read_csv('GabaritosCsv/'+prova.replace('_CH','').replace('_LC','').replace('_CN','').replace('_MT',''))\n",
    "    df['answer'],df['language'] = match_gab_to_df(df,gab)\n",
    "    #we want to sort dataframe in the following order: english -> spanish -> portuguese then question orders\n",
    "    #this is useful for the adding of microdata\n",
    "    df['sort_index'] = df['language'].apply(lambda lang: 0 if lang == 'english' else 1 if lang=='spanish' else 2)\n",
    "    df = df.sort_values(by=['sort_index','question_number']).drop(columns=['sort_index'])\n",
    "    \n",
    "    #fix maths in the answers\n",
    "    for col in ['A','B','C','D','E','body']:\n",
    "        df[col] = df[col].apply(lambda text: re.sub(' ?,? ?[Ff]echa parêntese ?,?',')',re.sub(' ?,? ?[Aa]bre parêntese ?,? ?',' (',text)) if type(text)==str else text)\n",
    "    \n",
    "    #add microdata data\n",
    "    if year != 2023:\n",
    "        important_columns = ['CO_PROVA','NU_PARAM_A','NU_PARAM_B','NU_PARAM_C','TP_LINGUA','CO_POSICAO','CO_HABILIDADE','TX_GABARITO']\n",
    "        microdata = pd.read_csv(f'microdados/ITENS_PROVA_{year}.csv',sep=';',usecols=important_columns, encoding='latin-1')\n",
    "        microdata.loc[microdata['TP_LINGUA'].isna(),'TP_LINGUA']='portuguese'\n",
    "        microdata.loc[microdata['TP_LINGUA']==0,'TP_LINGUA']='english'\n",
    "        microdata.loc[microdata['TP_LINGUA']==1,'TP_LINGUA']='spanish'\n",
    "    \n",
    "\n",
    "#     #divide tests\n",
    "    for test in df['test'].drop_duplicates().tolist():\n",
    "#         print(prova+' '+ test)\n",
    "        one_subject_test = df[df['test']==test].copy().reset_index(drop=True)\n",
    "        if test == 'redação':\n",
    "            one_subject_test.to_csv(f'ProvasComMicrodados/ENEM_{year}_{test}.csv')\n",
    "            continue\n",
    "        else:\n",
    "            CO_PROVA = codes[year][test]\n",
    "#             specific_microdata = microdata[microdata['CO_PROVA']==CO_PROVA].copy()\n",
    "#             one_subject_test['CO_PROVA'] = CO_PROVA\n",
    "\n",
    "#             try:\n",
    "#                 output = pd.merge(one_subject_test,specific_microdata,left_on=['question_number','language'],right_on=['CO_POSICAO','TP_LINGUA'])\n",
    "#                 if output.empty:\n",
    "#                     raise Exception(f'Failed matching')\n",
    "#             except:\n",
    "#                 one_subject_test['match'] = list(one_subject_test.index+1)\n",
    "#                 output = pd.merge(one_subject_test,specific_microdata,left_on=['match'],right_on=['CO_POSICAO']).drop(columns=['match'])\n",
    "#                 if not (output['TX_GABARITO']!=output['answer']).sum()==0:\n",
    "#                     print(f'Gabarito and answer should be the same for test {prova} {test}')\n",
    "#                     #CO_PROVA = str(CO_PROVA) + 'MAYBEBROKENDONOTUSE'\n",
    "#             if output.empty:\n",
    "#                 print('Output empty??')\n",
    "#             if len(output)!=45 and len(output)!=50:\n",
    "#                 raise Exception('Inconsistent Output')\n",
    "#             output.drop(columns=['CO_PROVA_x','TP_LINGUA','CO_POSICAO'],inplace=True)\n",
    "#             output.rename(columns={'CO_PROVA_y':'CO_PROVA'},inplace=True)\n",
    "#             output['question_number'] = output['question_number'].astype(int)\n",
    "        one_subject_test['question_number']=one_subject_test['question_number'].astype(int)\n",
    "        if len(one_subject_test)!=45 and len(one_subject_test)!=50:\n",
    "            print(prova,len(one_subject_test),test)\n",
    "        one_subject_test.to_csv(f'ProvasComMicrodados/ENEM_{year}_{test}_CO_PROVA_{CO_PROVA}.csv',index=False,encoding='utf-8')\n",
    "    dfs[prova] = df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "f6078ae1",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0\n",
      "0\n",
      "0\n",
      "0\n",
      "0\n",
      "0\n",
      "0\n",
      "0\n",
      "0\n",
      "0\n",
      "0\n",
      "0\n",
      "0\n",
      "0\n",
      "0\n",
      "0\n",
      "0\n",
      "0\n",
      "0\n",
      "0\n"
     ]
    }
   ],
   "source": [
    "#remove dumb parsing artifacts from specific provas\n",
    "year=2022\n",
    "base_path = \"./ProvasComMicrodados/\"\n",
    "enems = os.listdir(base_path)\n",
    "df = pd.concat([pd.read_csv(base_path+file) for file in enems ]).sort_values('question_number').reset_index(drop=True)\n",
    "for file in enems:\n",
    "    if '2022' in file and 'redação' not in file:\n",
    "        df = pd.read_csv(base_path+file)\n",
    "        df['A'] = df['A'].str.lstrip('A').str.strip(' ')\n",
    "        print(df['A'].isna().sum())\n",
    "        df['B'] = df['B'].str.lstrip('B').str.strip(' ')\n",
    "        print(df['B'].isna().sum())\n",
    "        df['C'] = df['C'].str.lstrip('C').str.strip(' ')\n",
    "        print(df['C'].isna().sum())\n",
    "        df['D'] = df['D'].str.lstrip('D').str.strip(' ')\n",
    "        print(df['D'].isna().sum())\n",
    "        df['E'] = df['E'].str.lstrip('E').str.strip(' ')\n",
    "        print(df['E'].isna().sum())\n",
    "        df.to_csv(base_path+file,encoding='utf-8',index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "id": "783544f8",
   "metadata": {},
   "outputs": [],
   "source": [
    "prova = 'ia2_caderno6_CAD_cinza_ledor_2013.csv'\n",
    "df = pd.read_csv('data/ia2_caderno6_CAD_cinza_ledor_2013.csv ',encoding='utf-8')\n",
    "year=2013\n",
    "df['question'] = df['question'].apply(lambda question: question.upper())\n",
    "df['year'] = year\n",
    "df['question_number'] = df['question'].apply(lambda question: int(re.search(\"[\\d]+\",question).group()) if re.search(\"[\\d]+\",question) else np.nan)\n",
    "df['test']='redação'\n",
    "if 'D1_' in prova or 'ia1' in prova or 'DIA_1' in prova:\n",
    "    if year < 2017:\n",
    "        df.loc[df['question_number']<=45,'test'] = 'CH'\n",
    "        df.loc[df['question_number']>45,'test'] = 'CN'\n",
    "    else:\n",
    "        df.loc[df['question_number']<=45,'test'] = 'LC'\n",
    "        df.loc[df['question_number']>45,'test'] = 'CH'\n",
    "else:\n",
    "    if year < 2017:\n",
    "        df.loc[df['question_number']<=135,'test'] = 'LC'\n",
    "        df.loc[df['question_number']>135,'test'] = 'MT'\n",
    "    else:\n",
    "        df.loc[df['question_number']<=135,'test'] = 'CN'\n",
    "        df.loc[df['question_number']>135,'test'] = 'MT'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9d632933",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
