{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 22,
   "id": "ff5956f2",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Requirement already satisfied: pymupdf in c:\\users\\msart\\anaconda3\\envs\\ic\\lib\\site-packages (1.23.3)\n",
      "Collecting pymupdf\n",
      "  Obtaining dependency information for pymupdf from https://files.pythonhosted.org/packages/f0/6d/8b646ba3096cd5ccbd9714c367f9faca18fca072eed83f5622050dbbfde3/PyMuPDF-1.23.4-cp311-none-win_amd64.whl.metadata\n",
      "  Downloading PyMuPDF-1.23.4-cp311-none-win_amd64.whl.metadata (3.4 kB)\n",
      "Requirement already satisfied: PyPDF2 in c:\\users\\msart\\anaconda3\\envs\\ic\\lib\\site-packages (3.0.1)\n",
      "Requirement already satisfied: PyMuPDFb==1.23.3 in c:\\users\\msart\\anaconda3\\envs\\ic\\lib\\site-packages (from pymupdf) (1.23.3)\n",
      "Downloading PyMuPDF-1.23.4-cp311-none-win_amd64.whl (3.5 MB)\n",
      "   ---------------------------------------- 0.0/3.5 MB ? eta -:--:--\n",
      "   - -------------------------------------- 0.1/3.5 MB 2.4 MB/s eta 0:00:02\n",
      "   ----------- ---------------------------- 1.0/3.5 MB 10.9 MB/s eta 0:00:01\n",
      "   ------------ --------------------------- 1.0/3.5 MB 11.0 MB/s eta 0:00:01\n",
      "   ------------ --------------------------- 1.0/3.5 MB 11.0 MB/s eta 0:00:01\n",
      "   ------------------------ --------------- 2.1/3.5 MB 9.1 MB/s eta 0:00:01\n",
      "   ---------------------------------------  3.5/3.5 MB 12.9 MB/s eta 0:00:01\n",
      "   ---------------------------------------- 3.5/3.5 MB 12.2 MB/s eta 0:00:00\n",
      "Installing collected packages: pymupdf\n",
      "  Attempting uninstall: pymupdf\n",
      "    Found existing installation: PyMuPDF 1.23.3\n",
      "    Uninstalling PyMuPDF-1.23.3:\n",
      "      Successfully uninstalled PyMuPDF-1.23.3\n",
      "Note: you may need to restart the kernel to use updated packages.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "ERROR: Could not install packages due to an OSError: [WinError 5] Acesso negado: 'C:\\\\Users\\\\msart\\\\AppData\\\\Local\\\\Temp\\\\pip-uninstall-fc_rmfjc\\\\_fitz.cp311-win_amd64.pyd'\n",
      "Consider using the `--user` option or check the permissions.\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# Versions are pinned so that a fresh run installs the same parsers every time.\n",
    "%pip install pymupdf==1.23.4 PyPDF2==3.0.1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "902a984d",
   "metadata": {},
   "outputs": [],
   "source": [
    "from PyPDF2 import PdfReader\n",
    "import re,fitz\n",
    "import pandas as pd\n",
    "import os"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "0cb1c48e-4c74-45c2-bc09-7a59c65d5563",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Set `ocr` to True to import the OCR toolchain (tempfile, pytesseract, pdf2image,\n",
    "# Pillow) at notebook level; all four imports are skipped while it is False.\n",
    "ocr = False\n",
    "if ocr:\n",
    "    from tempfile import TemporaryDirectory\n",
    "    import pytesseract\n",
    "    from pdf2image import convert_from_path\n",
    "    from PIL import Image"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "f234b266",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Input PDF and output CSV paths for a single exam; edit both when switching exams.\n",
    "# NOTE(review): the batch-conversion cell further down assigns FILENAME and OUTNAME\n",
    "# again for every file it processes, so these values are replaced once it runs.\n",
    "FILENAME = 'ProvasSemImagem/ENEM_2021_P1_CAD_11_DIA_2_LARANJA_LEDOR.pdf'\n",
    "OUTNAME = 'Data/ENEM_2021_P1_CAD_11_DIA_2_LARANJA_LEDOR.csv'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 64,
   "id": "69126a2f",
   "metadata": {},
   "outputs": [],
   "source": [
    "class EnemAutomata:\n",
    "    \"\"\"State machine that assembles ENEM questions from a stream of text fragments.\n",
    "\n",
    "    Fragments are fed one at a time to ``read``. The states are 0 (waiting for a\n",
    "    header), 1 (question body), 'A'..'E' (one per alternative) and 'R' (essay);\n",
    "    ``self.question`` accumulates the text of the question in progress.\n",
    "    \"\"\"\n",
    "\n",
    "    # Raw strings hand the backslashes to the regex engine untouched instead of\n",
    "    # relying on Python's deprecated pass-through of unknown string escapes.\n",
    "    QUESTION_HEADER = re.compile(r'questão [\\d]+')\n",
    "    STAR_MARKER = re.compile(r'\\*.*\\*')\n",
    "    ESSAY_HEADER = 'instruções para a redação'\n",
    "    TEXT_FIELDS = ('body', 'A', 'B', 'C', 'D', 'E')\n",
    "    LAST_ALTERNATIVE = 'E'\n",
    "\n",
    "    def __init__(self):\n",
    "        self.state = 0\n",
    "        self.question = {}\n",
    "        # Human-readable label for every state; not used by the transitions.\n",
    "        self.state_dict = {\n",
    "            0: 'header',\n",
    "            1: 'body',\n",
    "            'A': 'alternative A',\n",
    "            'B': 'alternative B',\n",
    "            'C': 'alternative C',\n",
    "            'D': 'alternative D',\n",
    "            'E': 'alternative E',\n",
    "            'R': 'Essay'\n",
    "        }\n",
    "\n",
    "    def clear_memory(self):\n",
    "        \"\"\"Discard the question in progress and wait for the next header.\"\"\"\n",
    "        self.state = 0\n",
    "        self.question = {}\n",
    "\n",
    "    def _blank_question(self, title):\n",
    "        \"\"\"Return a record named `title` in which every text field is an empty string.\"\"\"\n",
    "        record = {'question': title}\n",
    "        record.update(dict.fromkeys(self.TEXT_FIELDS, ''))\n",
    "        return record\n",
    "\n",
    "    def _append(self, field, part):\n",
    "        \"\"\"Append `part` to `field`, without its outer newlines and followed by one space.\"\"\"\n",
    "        piece = (part.strip('\\n') + ' ').replace('  ', ' ')\n",
    "        self.question[field] = self.question.get(field, '') + piece\n",
    "\n",
    "    def _finish(self):\n",
    "        \"\"\"Return a copy of the finished question and reset the machine.\"\"\"\n",
    "        finished = self.question.copy()\n",
    "        self.clear_memory()\n",
    "        return finished\n",
    "\n",
    "    def _closes_question(self, part):\n",
    "        \"\"\"Tell whether `part` ends the question whose alternative E is open.\"\"\"\n",
    "        lowered = part.lower()\n",
    "        return bool(\n",
    "            self.QUESTION_HEADER.search(lowered)\n",
    "            or 'ENDOFENEM' in part\n",
    "            or 'LC -' in part\n",
    "            or 'CH -' in part\n",
    "            or self.STAR_MARKER.search(part)\n",
    "            or self.ESSAY_HEADER in lowered\n",
    "        )\n",
    "\n",
    "    def letter_state(self, current_state, next_state, part):\n",
    "        \"\"\"Handle a fragment while the alternative `current_state` is open.\n",
    "\n",
    "        A fragment that is exactly the next letter (e.g. 'B') or its lowercase\n",
    "        dotted form (e.g. 'b.') switches to `next_state`; any other fragment is\n",
    "        appended to the text of `current_state`.\n",
    "        \"\"\"\n",
    "        marker = part.strip()\n",
    "        if marker == next_state or marker == next_state.lower() + '.':\n",
    "            self.state = next_state\n",
    "        else:\n",
    "            self._append(current_state, part)\n",
    "\n",
    "    def read(self, part):\n",
    "        \"\"\"Consume one text fragment.\n",
    "\n",
    "        Returns the finished question (a dict with the keys 'question', 'body'\n",
    "        and 'A'..'E') when `part` closes a question or the essay, otherwise\n",
    "        False. The closing fragment is not processed in that call, so the\n",
    "        caller has to pass it again for it to open the next question.\n",
    "        \"\"\"\n",
    "        if self.state == self.LAST_ALTERNATIVE and self._closes_question(part):\n",
    "            return self._finish()\n",
    "        if self.state == 'R' and self.STAR_MARKER.search(part.lower()):\n",
    "            return self._finish()\n",
    "\n",
    "        if not part:\n",
    "            return False\n",
    "\n",
    "        lowered = part.lower()\n",
    "        if self.state == 0:\n",
    "            if self.QUESTION_HEADER.search(lowered):\n",
    "                # Every field starts as '' so all questions share one schema, even\n",
    "                # when the body or an alternative turns out to be empty.\n",
    "                self.question = self._blank_question(part.strip())\n",
    "                self.state = 1\n",
    "            elif self.ESSAY_HEADER in lowered or 'PROPOSTA DE REDAÇÃO' in part:\n",
    "                self.question = self._blank_question('redação')\n",
    "                self.state = 'R'\n",
    "        elif self.state == 'R':\n",
    "            self._append('body', part)\n",
    "        elif self.state == 1:\n",
    "            if part.strip() in ('A', 'a.'):\n",
    "                self.state = 'A'\n",
    "            else:\n",
    "                self._append('body', part)\n",
    "        elif self.state == self.LAST_ALTERNATIVE:\n",
    "            # No alternative follows E: a lone 'F' / 'f.' line is ordinary text. Treating\n",
    "            # it as a marker would move to a state that never reaches the end check.\n",
    "            self._append(self.state, part)\n",
    "        else:\n",
    "            self.letter_state(self.state, chr(ord(self.state) + 1), part)\n",
    "        return False"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 65,
   "id": "aa3bf6fb-4dda-4988-baea-dbd76a92f813",
   "metadata": {},
   "outputs": [],
   "source": [
    "class OCRAutomata:\n",
    "    \"\"\"State machine that groups OCR text lines into questions.\n",
    "\n",
    "    Everything read between two question headers is stored under 'body'; the\n",
    "    keys 'A'..'E' are present in every record but are never filled (they stay\n",
    "    None). States: 0 (waiting for a header), 1 (inside a question), 'R' (essay).\n",
    "    \"\"\"\n",
    "\n",
    "    # Raw strings hand the backslashes to the regex engine untouched instead of\n",
    "    # relying on Python's deprecated pass-through of unknown string escapes.\n",
    "    QUESTION_HEADER = re.compile(r'questão [\\d]+')\n",
    "    STAR_MARKER = re.compile(r'\\*.*\\*')\n",
    "    ESSAY_HEADER = 'instruções para a redação'\n",
    "\n",
    "    def __init__(self):\n",
    "        # Human-readable label for every state; not used by the transitions.\n",
    "        self.state_dict = {\n",
    "            0: 'header',\n",
    "            1: 'body',\n",
    "            'A': 'alternative A',\n",
    "            'B': 'alternative B',\n",
    "            'C': 'alternative C',\n",
    "            'D': 'alternative D',\n",
    "            'E': 'alternative E',\n",
    "            'R': 'Essay'\n",
    "        }\n",
    "        # Start from the same blank record that follows every finished question, so\n",
    "        # the first question has the same keys as all the later ones.\n",
    "        self.clear_memory()\n",
    "\n",
    "    def clear_memory(self):\n",
    "        \"\"\"Reset to state 0 with a blank record (empty texts, alternatives set to None).\"\"\"\n",
    "        self.state = 0\n",
    "        self.question = {\n",
    "            'question': '',\n",
    "            'body': '',\n",
    "            'A': None,\n",
    "            'B': None,\n",
    "            'C': None,\n",
    "            'D': None,\n",
    "            'E': None,\n",
    "        }\n",
    "\n",
    "    def _append_body(self, part):\n",
    "        \"\"\"Append `part` to the body, without its outer newlines and followed by one space.\"\"\"\n",
    "        piece = (part.strip('\\n') + ' ').replace('  ', ' ')\n",
    "        self.question['body'] = self.question.get('body', '') + piece\n",
    "\n",
    "    def _finish(self):\n",
    "        \"\"\"Return a copy of the finished record and reset the machine.\"\"\"\n",
    "        finished = self.question.copy()\n",
    "        self.clear_memory()\n",
    "        return finished\n",
    "\n",
    "    def _closes_question(self, part):\n",
    "        \"\"\"Tell whether `part` ends the question that is currently open.\"\"\"\n",
    "        lowered = part.lower()\n",
    "        return bool(\n",
    "            self.QUESTION_HEADER.search(lowered)\n",
    "            or 'ENDOFENEM' in part\n",
    "            or 'LC -' in part\n",
    "            or 'CH -' in part\n",
    "            or self.STAR_MARKER.search(part)\n",
    "            or self.ESSAY_HEADER in lowered\n",
    "        )\n",
    "\n",
    "    def read(self, part):\n",
    "        \"\"\"Consume one line of OCR text.\n",
    "\n",
    "        Returns the finished record when `part` closes a question or the essay,\n",
    "        otherwise False. The closing line is not processed in that call, so the\n",
    "        caller has to pass it again for it to open the next question.\n",
    "        \"\"\"\n",
    "        if self.state == 1 and self._closes_question(part):\n",
    "            return self._finish()\n",
    "        if self.state == 'R' and self.STAR_MARKER.search(part.lower()):\n",
    "            return self._finish()\n",
    "\n",
    "        if not part:\n",
    "            return False\n",
    "\n",
    "        lowered = part.lower()\n",
    "        if self.state == 0:\n",
    "            header = self.QUESTION_HEADER.search(lowered)\n",
    "            if header:\n",
    "                # Only the matched, lowercased 'questão N' is kept as the title.\n",
    "                self.question['question'] = header.group()\n",
    "                self.state = 1\n",
    "            elif self.ESSAY_HEADER in lowered or 'PROPOSTA DE REDAÇÃO' in part:\n",
    "                self.question.update(\n",
    "                    {'question': 'redação', 'body': '', 'A': None, 'B': None,\n",
    "                     'C': None, 'D': None, 'E': None}\n",
    "                )\n",
    "                self.state = 'R'\n",
    "        elif self.state == 'R' or self.state == 1:\n",
    "            self._append_body(part)\n",
    "        return False"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 70,
   "id": "1dbd49d3",
   "metadata": {},
   "outputs": [],
   "source": [
    "class PhysicalEnemParser:\n",
    "    \"\"\"Extracts the text fragments of an ENEM exam PDF and groups them into questions.\n",
    "\n",
    "    `engine` selects the extraction backend and what `enem_object` has to be:\n",
    "\n",
    "    * 'pypdf2'  - an object with `.pages`, each page offering\n",
    "      `extract_text(visitor_text=...)`;\n",
    "    * 'pymupdf' - an indexable document whose pages offer `get_images`,\n",
    "      `get_image_bbox`, `get_textbox` and `get_text`;\n",
    "    * 'OCR'     - the path of the PDF, rendered with pdf2image and read with\n",
    "      pytesseract.\n",
    "\n",
    "    The fragments end up in `self.parts`, terminated by the 'ENDOFENEM' sentinel.\n",
    "\n",
    "    Raises:\n",
    "        ValueError: if `engine` is not one of the three names above.\n",
    "    \"\"\"\n",
    "\n",
    "    ENGINES = ('pymupdf', 'pypdf2', 'OCR')\n",
    "    END_MARKER = 'ENDOFENEM'\n",
    "\n",
    "    def __init__(self, enem_object, engine='pypdf2'):\n",
    "        if engine not in self.ENGINES:\n",
    "            # An unknown engine used to yield an empty parser without any hint.\n",
    "            raise ValueError(f'Unknown engine {engine!r}; expected one of {self.ENGINES}')\n",
    "        self.enem_object = enem_object\n",
    "        self.engine = engine\n",
    "        if engine == 'pymupdf':\n",
    "            parts = self._parts_from_pymupdf(enem_object)\n",
    "        elif engine == 'pypdf2':\n",
    "            parts = self._parts_from_pypdf2(enem_object)\n",
    "        else:\n",
    "            parts = self._parts_from_ocr(enem_object)\n",
    "        # The sentinel lets the automata flush the last question for every engine.\n",
    "        parts.append(self.END_MARKER)\n",
    "        self.parts = parts\n",
    "\n",
    "    @staticmethod\n",
    "    def _parts_from_pymupdf(document):\n",
    "        \"\"\"Return the text lines of `document`, leaving out lines that lie inside image boxes.\"\"\"\n",
    "        parts = []\n",
    "        # Page index 0 is not read - presumably it is the cover page; TODO confirm.\n",
    "        for page_num in range(1, len(document)):\n",
    "            page = document[page_num]\n",
    "            image_lines = set()\n",
    "            for image in page.get_images(full=True):\n",
    "                bbox = page.get_image_bbox(image)\n",
    "                image_lines.update(page.get_textbox(bbox).split('\\n'))\n",
    "            parts.extend(line for line in page.get_text().split('\\n') if line not in image_lines)\n",
    "        return parts\n",
    "\n",
    "    @staticmethod\n",
    "    def _parts_from_pypdf2(reader):\n",
    "        \"\"\"Return every text chunk reported by the pages' `extract_text` visitor.\"\"\"\n",
    "        parts = []\n",
    "\n",
    "        def visitor_body(text, cm, tm, fontDict, fontSize):\n",
    "            parts.append(text.replace('[supressão de texto]', '[...]'))\n",
    "\n",
    "        for page in reader.pages:\n",
    "            page.extract_text(visitor_text=visitor_body)\n",
    "        return parts\n",
    "\n",
    "    @staticmethod\n",
    "    def _parts_from_ocr(pdf_path, dpi=500, language_config=r'-l por --psm 1'):\n",
    "        \"\"\"Render the PDF at `pdf_path` to JPEG pages, OCR them and return the text lines.\"\"\"\n",
    "        # Imported locally so the other engines do not require the OCR packages.\n",
    "        import os\n",
    "        from tempfile import TemporaryDirectory\n",
    "        import pytesseract\n",
    "        from pdf2image import convert_from_path\n",
    "        from PIL import Image\n",
    "\n",
    "        parsed = ''\n",
    "        with TemporaryDirectory() as tempdir:\n",
    "            pdf_pages = convert_from_path(pdf_path, dpi)\n",
    "            image_file_list = []\n",
    "            for page_enumeration, page in enumerate(pdf_pages, start=1):\n",
    "                # os.path.join keeps the image inside `tempdir` on every OS; a literal\n",
    "                # backslash is part of the file name on POSIX, not a separator.\n",
    "                filename = os.path.join(tempdir, f'page_{page_enumeration:03}.jpg')\n",
    "                page.save(filename, 'JPEG')\n",
    "                image_file_list.append(filename)\n",
    "            for image_file in image_file_list:\n",
    "                # The context manager closes each image once it has been recognised.\n",
    "                with Image.open(image_file) as image:\n",
    "                    parsed += str(pytesseract.image_to_string(image, config=language_config))\n",
    "        return parsed.split('\\n')\n",
    "\n",
    "    def parse_questions(self):\n",
    "        \"\"\"Feed `self.parts` through the automaton of the engine and return the finished questions.\"\"\"\n",
    "        self.automata = OCRAutomata() if self.engine == 'OCR' else EnemAutomata()\n",
    "        questions = []\n",
    "        for part in self.parts:\n",
    "            accept = self.automata.read(part)\n",
    "            # A fragment that closes a question is read again so that it can\n",
    "            # also open the next one.\n",
    "            while accept:\n",
    "                questions.append(accept)\n",
    "                accept = self.automata.read(part)\n",
    "        return questions"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 71,
   "id": "3664e5a3",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Batch conversion: every PDF in INPUT_DIR becomes a CSV with the same base name\n",
    "# in OUTPUT_DIR.\n",
    "INPUT_DIR = '2023'\n",
    "OUTPUT_DIR = 'Data'\n",
    "\n",
    "# Essay post-processing patterns.\n",
    "# NOTE(review): in ESSAY_HEADER the lazy `.*?` followed by a negative lookahead\n",
    "# normally matches just the words 'PROPOSTA DE REDAÇÃO', so the proposal text stays\n",
    "# where it is and is appended a second time at the end. Confirm whether the intent\n",
    "# was to move the proposal instead; the patterns are kept as they were.\n",
    "ESSAY_PROPOSAL = re.compile(r'PROPOSTA DE REDAÇÃO.*?(?=TEXTO)')\n",
    "ESSAY_HEADER = re.compile(r'PROPOSTA DE REDAÇÃO.*?(?!(TEXTO))')\n",
    "\n",
    "\n",
    "def clean_cell(value):\n",
    "    \"\"\"Turn tabs into spaces, collapse double spaces once and trim; non-strings pass through.\"\"\"\n",
    "    if not isinstance(value, str):\n",
    "        # Missing values (NaN / None) have no text to clean.\n",
    "        return value\n",
    "    return value.replace('\\t', ' ').replace('  ', ' ').strip()\n",
    "\n",
    "\n",
    "os.makedirs(OUTPUT_DIR, exist_ok=True)\n",
    "# Only PDFs are converted; sorting makes the processing order reproducible.\n",
    "files = sorted(name for name in os.listdir(INPUT_DIR) if name.lower().endswith('.pdf'))\n",
    "for fileno in files:\n",
    "    FILENAME = os.path.join(INPUT_DIR, fileno)\n",
    "    # splitext removes only the extension; str.strip('.pdf') would strip any of the\n",
    "    # characters '.', 'p', 'd', 'f' from both ends of the name.\n",
    "    OUTNAME = os.path.join(OUTPUT_DIR, os.path.splitext(fileno)[0] + '.csv')\n",
    "    enem = PdfReader(FILENAME)\n",
    "    parser = PhysicalEnemParser(enem, engine='pypdf2')\n",
    "    questions = parser.parse_questions()\n",
    "    if not questions:\n",
    "        # Without rows there is no 'question' column to work with.\n",
    "        print(f'No questions parsed from {FILENAME}; skipping.')\n",
    "        continue\n",
    "    df = pd.DataFrame(questions)\n",
    "    is_essay = df['question'] == 'redação'\n",
    "    if is_essay.any():\n",
    "        essay_instructions = df.loc[is_essay, 'body'].iloc[0]\n",
    "        proposal = ESSAY_PROPOSAL.search(essay_instructions)\n",
    "        # When no proposal section is found the essay body is left untouched.\n",
    "        if proposal is not None:\n",
    "            df.loc[is_essay, 'body'] = ESSAY_HEADER.sub('', essay_instructions) + proposal.group(0)\n",
    "    for column in df.columns:\n",
    "        df[column] = df[column].apply(clean_cell)\n",
    "    df.to_csv(OUTNAME, index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b8e4ec8b",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
