{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "7af2eb9e",
   "metadata": {},
   "outputs": [],
   "source": [
    "import requests\n",
    "\n",
    "word_site = \"https://www.mit.edu/~ecprice/wordlist.10000\"\n",
    "\n",
    "# Fail fast on network problems: without a timeout the request can block the kernel\n",
    "# indefinitely, and without the status check the body of an HTTP error response\n",
    "# would be split into lines and used as the word list.\n",
    "response = requests.get(word_site, timeout=30)\n",
    "response.raise_for_status()\n",
    "\n",
    "# The payload is bytes with one entry per line; decode each line to str.\n",
    "WORDS = [line.decode('utf-8') for line in response.content.splitlines()]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "id": "2655e9a2",
   "metadata": {},
   "outputs": [],
   "source": [
    "from typing import List\n",
    "import random \n",
    "def get_question(WORDS:List[str]) -> str:\n",
    "    \"\"\"Build one nonsense question out of randomly chosen words.\n",
    "\n",
    "    Args:\n",
    "        WORDS: Non-empty list of candidate words; words are drawn with replacement.\n",
    "\n",
    "    Returns:\n",
    "        A string of 5 to 20 (inclusive) words joined by single spaces, with\n",
    "        str.capitalize() applied to the first word and a trailing question mark.\n",
    "\n",
    "    Raises:\n",
    "        ValueError: If WORDS is empty.\n",
    "    \"\"\"\n",
    "    if len(WORDS) == 0:\n",
    "        raise ValueError(\"WORDS must contain at least one word\")\n",
    "    num_words = random.randint(5, 20)\n",
    "    # random.choice is the standard-library way to pick a uniformly random element.\n",
    "    question = [random.choice(WORDS) for _ in range(num_words)]\n",
    "    question[0] = question[0].capitalize()\n",
    "    return \" \".join(question) + \"?\"\n",
    "\n",
    "def get_answers(WORDS:List[str], num_answers = 4) -> List[str]:\n",
    "    \"\"\"Build a list of nonsense answer options out of randomly chosen words.\n",
    "\n",
    "    Args:\n",
    "        WORDS: List of candidate words; words are drawn with replacement.\n",
    "            Must be non-empty whenever at least one answer is requested.\n",
    "        num_answers: Number of answer strings to generate.\n",
    "\n",
    "    Returns:\n",
    "        A list of num_answers strings. Each string is 1 to 6 (inclusive) words\n",
    "        joined by single spaces, with str.capitalize() applied to the first word.\n",
    "\n",
    "    Raises:\n",
    "        ValueError: If answers are requested but WORDS is empty.\n",
    "    \"\"\"\n",
    "    if num_answers > 0 and len(WORDS) == 0:\n",
    "        raise ValueError(\"WORDS must contain at least one word\")\n",
    "    answers = []\n",
    "    for _ in range(num_answers):\n",
    "        num_words = random.randint(1, 6)\n",
    "        answer = [random.choice(WORDS) for _ in range(num_words)]\n",
    "        answer[0] = answer[0].capitalize()\n",
    "        answers.append(\" \".join(answer))\n",
    "    return answers"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 52,
   "id": "a778a803",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Dataset configuration.\n",
    "# NUM_ANSWERS is the number of options generated per question. produce_dataset\n",
    "# asserts that each question count is a multiple of it, so keep them divisible.\n",
    "NUM_ANSWERS = 10\n",
    "NUM_QUESTIONS_TEST = 1000\n",
    "NUM_QUESTIONS_VALIDATION = 400\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 53,
   "id": "a8a5f3f1",
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "def produce_dataset(WORDS,num_questions = 100, num_answers=4):\n",
    "    \"\"\"Generate nonsense multiple-choice records with a balanced answer key.\n",
    "\n",
    "    Each record gets a random question, num_answers random options and a\n",
    "    designated answer position. Every position is the designated answer for\n",
    "    exactly num_questions // num_answers records, in shuffled order.\n",
    "\n",
    "    Args:\n",
    "        WORDS: List of candidate words, passed to get_question and get_answers.\n",
    "        num_questions: Number of records; must be a multiple of num_answers.\n",
    "        num_answers: Options per question; between 1 and 16, one letter each.\n",
    "\n",
    "    Returns:\n",
    "        A list of dicts with the keys 'question', 'options', 'answer' (letter of\n",
    "        the designated option) and 'answer_index' (its 0-based position).\n",
    "\n",
    "    Raises:\n",
    "        ValueError: If num_answers is not between 1 and the number of letters.\n",
    "        AssertionError: If num_questions is not a multiple of num_answers.\n",
    "    \"\"\"\n",
    "    choices = [\"A\", \"B\", \"C\", \"D\", \"E\", \"F\", \"G\", \"H\", \"I\", \"J\", \"K\", \"L\", \"M\", \"N\", \"O\", \"P\"]\n",
    "    # Validate up front: zero would divide by zero below, and more options than\n",
    "    # letters would only fail later with an IndexError while building records.\n",
    "    if not 1 <= num_answers <= len(choices):\n",
    "        raise ValueError(f\"num_answers must be between 1 and {len(choices)}, got {num_answers}\")\n",
    "    assert num_questions%num_answers == 0, \"num_questions must be a multiple of num_answers\"\n",
    "    per_choice = num_questions // num_answers\n",
    "    # Balanced answer key: each option index repeated per_choice times, then shuffled.\n",
    "    ans = [idx for idx in range(num_answers) for _ in range(per_choice)]\n",
    "    random.shuffle(ans)\n",
    "    return [\n",
    "        {\n",
    "            \"question\": get_question(WORDS),\n",
    "            \"options\": get_answers(WORDS, num_answers=num_answers),\n",
    "            \"answer\": choices[idx],\n",
    "            \"answer_index\": idx,\n",
    "        }\n",
    "        for idx in ans\n",
    "    ]\n",
    "\n",
    "# Build both splits with the same option count; the test split is generated first.\n",
    "data = {\n",
    "    'test': produce_dataset(WORDS, num_questions=NUM_QUESTIONS_TEST, num_answers=NUM_ANSWERS),\n",
    "    'validation': produce_dataset(WORDS, num_questions=NUM_QUESTIONS_VALIDATION, num_answers=NUM_ANSWERS),\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 54,
   "id": "63eb9986",
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "import os\n",
    "# Write all splits to ./data as one JSON document; the option count is part of the file name.\n",
    "os.makedirs('./data',exist_ok=True)\n",
    "output_path = f\"./data/nonsense_{NUM_ANSWERS}.json\"\n",
    "with open(output_path, \"w\") as out_file:\n",
    "    json.dump(data, out_file, indent=4)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "mmlu",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
