{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 24,
   "id": "cc5ee9fc-dd05-4230-b71b-0270edc76184",
   "metadata": {},
   "outputs": [],
   "source": [
    "import json"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "id": "d7139b8e-f82d-4edd-ab3f-363f23b0e584",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "9160"
      ]
     },
     "execution_count": 28,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Load the complete SEP dataset from disk; `json.load` returns plain Python\n",
    "# objects (the name `df` is kept because later cells refer to it).\n",
    "# JSON text is UTF-8 by specification, so decode it explicitly instead of\n",
    "# relying on the platform's locale-dependent default encoding.\n",
    "with open(\"./SEP_dataset.json\", \"r\", encoding=\"utf-8\") as f:\n",
    "    df = json.load(f)\n",
    "\n",
    "# Number of top-level entries that were loaded.\n",
    "len(df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "id": "c4e5ff69-0f28-44ce-a6be-9defd1d2e67b",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(892, 8268)"
      ]
     },
     "execution_count": 29,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Entries whose `appended_task_id` is one of these ids are routed to `val_df`;\n",
    "# every other entry goes to `eval_df`.\n",
    "held_out_task_ids = (3, 11, 27, 34, 41, 53, 61, 77, 84, 91)\n",
    "\n",
    "val_df, eval_df = [], []\n",
    "for sample in df:\n",
    "    # Pick the destination list first, then append once.\n",
    "    if sample[\"info\"][\"appended_task_id\"] in held_out_task_ids:\n",
    "        bucket = val_df\n",
    "    else:\n",
    "        bucket = eval_df\n",
    "    bucket.append(sample)\n",
    "\n",
    "# Sizes of the two partitions.\n",
    "len(val_df), len(eval_df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "id": "df266d3c-29e1-4a9b-8f4a-09ed05711290",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Persist the validation partition as JSON.\n",
    "# Plain \"w\" is enough: the file is only written, never read back through this\n",
    "# handle, so the read/write mode \"w+\" was unnecessary. The encoding is stated\n",
    "# explicitly so the result does not depend on the platform default.\n",
    "with open(\"./SEP_validation_feb25.json\", \"w\", encoding=\"utf-8\") as f:\n",
    "    json.dump(val_df, f)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "id": "5db296e7-fecf-4640-a45c-9049fa967fff",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Persist the evaluation partition as JSON.\n",
    "# Plain \"w\" is enough: the file is only written, never read back through this\n",
    "# handle, so the read/write mode \"w+\" was unnecessary. The encoding is stated\n",
    "# explicitly so the result does not depend on the platform default.\n",
    "with open(\"./SEP_evaluation_feb25.json\", \"w\", encoding=\"utf-8\") as f:\n",
    "    json.dump(eval_df, f)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "id": "bfd20cdd-311f-4a28-804e-2834ad115628",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "99"
      ]
     },
     "execution_count": 32,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# `max_id` was not assigned in any visible cell of this notebook, so this cell\n",
    "# raised NameError on a fresh kernel (Restart & Run All).\n",
    "# NOTE(review): presumably it was the largest `appended_task_id` in the dataset\n",
    "# (the saved output of 99 fits that reading) -- confirm against the lost cell.\n",
    "max_id = max(elem[\"info\"][\"appended_task_id\"] for elem in df)\n",
    "max_id"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "34d031ab-9baf-45ea-b7a3-a11be7de75ed",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Read the whole file as one raw string (no JSON parsing happens here).\n",
    "# JSON text is UTF-8 by specification, so decode it explicitly instead of\n",
    "# relying on the platform's locale-dependent default encoding.\n",
    "with open(\"train_data/alpaca_52k_shuffled.json\", \"r\", encoding=\"utf-8\") as f:\n",
    "    text = f.read()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "a5ac84f0-f0d8-4076-ba35-17558e2ee9d1",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'\"output\": \"The primary emotion conveyed in the poem is sadness.\", \"orig_ix\": 24346, \"type\": \"orig\"}]'"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Display the final 100 characters of `text`.\n",
    "text[-100:]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "67572b45-81ab-4986-a90e-20f25e595fd5",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
