{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pickle\n",
    "from pcfg import PCFG\n",
    "from utils_grammar import compute_random_guess_metric, get_grammar_string, get_grammatical_sentences"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Grammar"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "pcfg_cfg3b_disjoint_terminals\n",
      "Random guess loss: 2.1972245773362196\n",
      "\n",
      "        S -> A16 [1]\n",
      "        A16 -> A15 A13 [0.50]\n",
      "        A16 -> A13 A15 A14 [0.50]\n",
      "        A13 -> A11 A12 [0.50]\n",
      "        A13 -> A12 A11 [0.50]\n",
      "        A14 -> A11 A10 A12 [0.50]\n",
      "        A14 -> A10 A11 A12 [0.50]\n",
      "        A15 -> A12 A11 A10 [0.50]\n",
      "        A15 -> A11 A12 A10 [0.50]\n",
      "        A10 -> A7 A9 A8 [0.50]\n",
      "        A10 -> A9 A8 A7 [0.50]\n",
      "        A11 -> A8 A7 A9 [0.50]\n",
      "        A11 -> A7 A8 A9 [0.50]\n",
      "        A12 -> A8 A9 A7 [0.50]\n",
      "        A12 -> A9 A7 A8 [0.50]\n",
      "        A7 -> '3' '1' [0.50]\n",
      "        A7 -> '1' '2' '3' [0.50]\n",
      "        A8 -> '6' '5' [0.50]\n",
      "        A8 -> '6' '4' '5' [0.50]\n",
      "        A9 -> '9' '8' '7' [0.50]\n",
      "        A9 -> '8' '7' [0.50]\n",
      "    \n"
     ]
    }
   ],
   "source": [
    "grammar_name = \"pcfg_cfg3b_disjoint_terminals\"\n",
    "\n",
    "\n",
    "print(grammar_name)\n",
    "print(f\"Random guess loss: {compute_random_guess_metric(grammar_name)}\") # Consider uniform terminals\n",
    "grammar_string = get_grammar_string(grammar_name)\n",
    "print(grammar_string)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'\\nIf you want to manually generate the data\\n'"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "\"\"\"\n",
    "If you want to manually generate the data\n",
    "\"\"\"\n",
    "# num_samples = 10000\n",
    "# seed = 5\n",
    "# grammar = PCFG.fromstring(grammar_string)\n",
    "# sequences, sentence_to_non_terminal_applied_position_map, sequence_freq, sentence_prob_dict = get_grammatical_sentences(grammar, num_samples, seed)\n",
    "# len(sequences)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Dataset train_sequences | Length = 5000\n",
      "Dataset test_sequences | Length = 5000\n",
      "Dataset non_grammatical_sequences | Length = 5000\n",
      "Dataset non_grammatical_test_sequences_grammar_edit_1_1_all | Length = 1000\n",
      "Dataset non_grammatical_test_sequences_grammar_edit_2_1_all | Length = 1000\n",
      "Dataset non_grammatical_test_sequences_grammar_edit_3_1_all | Length = 1000\n",
      "Dataset non_grammatical_test_sequences_grammar_edit_4_1_all | Length = 1000\n",
      "Dataset non_grammatical_test_sequences_edit_distance_1_31_68 | Length = 4770\n",
      "Dataset non_grammatical_test_sequences_edit_distance_2_31_68 | Length = 4902\n",
      "Dataset non_grammatical_test_sequences_edit_distance_3_31_68 | Length = 4976\n",
      "Dataset non_grammatical_train_sequences_edit_distance_1_31_69 | Length = 4752\n",
      "Dataset non_grammatical_train_sequences_edit_distance_2_31_69 | Length = 4909\n",
      "Dataset non_grammatical_train_sequences_edit_distance_3_31_69 | Length = 4983\n"
     ]
    }
   ],
   "source": [
    "filename = f\"sequences_w_edit_distance_{grammar_name}_10000_5.pkl\"\n",
    "with open(filename, 'rb') as f:\n",
    "    data = pickle.load(f)\n",
    "\n",
    "# only consider train_sequences and test_sequences \n",
    "for key in data:\n",
    "    print(f\"Dataset {key} | Length = {len(data[key])}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "('9', '8', '7', '3', '1', '6', '5', '1', '2', '3', '6', '5', '8', '7', '1', '2', '3', '8', '7', '6', '4', '5', '9', '8', '7', '1', '2', '3', '6', '4', '5', '3', '1', '6', '4', '5', '8', '7')\n",
      "('6', '5', '3', '1', '8', '7', '9', '8', '7', '3', '1', '6', '4', '5', '3', '1', '9', '8', '7', '6', '5', '8', '7', '1', '2', '3', '6', '4', '5', '6', '4', '5', '1', '2', '3', '9', '8', '7')\n",
      "('6', '4', '5', '3', '1', '8', '7', '9', '8', '7', '1', '2', '3', '6', '4', '5', '6', '4', '5', '1', '2', '3', '9', '8', '7', '8', '7', '3', '1', '6', '4', '5', '3', '1', '9', '8', '7', '6', '4', '5', '3', '1', '8', '7', '6', '4', '5', '1', '2', '3', '6', '5', '8', '7', '8', '7', '1', '2', '3', '6', '4', '5')\n",
      "('6', '5', '8', '7', '1', '2', '3', '6', '4', '5', '1', '2', '3', '8', '7', '1', '2', '3', '9', '8', '7', '6', '4', '5', '6', '4', '5', '9', '8', '7', '1', '2', '3', '1', '2', '3', '6', '4', '5', '9', '8', '7')\n",
      "('3', '1', '6', '5', '8', '7', '9', '8', '7', '3', '1', '6', '5', '1', '2', '3', '8', '7', '6', '5', '9', '8', '7', '1', '2', '3', '6', '4', '5', '1', '2', '3', '6', '4', '5', '9', '8', '7')\n",
      "('6', '4', '5', '1', '2', '3', '9', '8', '7', '6', '4', '5', '8', '7', '3', '1', '6', '5', '3', '1', '9', '8', '7', '6', '5', '9', '8', '7', '1', '2', '3', '8', '7', '6', '4', '5', '3', '1', '8', '7', '6', '4', '5', '1', '2', '3', '3', '1', '6', '4', '5', '8', '7', '9', '8', '7', '3', '1', '6', '5')\n",
      "('1', '2', '3', '6', '4', '5', '9', '8', '7', '6', '4', '5', '9', '8', '7', '1', '2', '3', '3', '1', '9', '8', '7', '6', '4', '5', '6', '5', '8', '7', '3', '1', '6', '4', '5', '1', '2', '3', '8', '7')\n",
      "('6', '4', '5', '3', '1', '9', '8', '7', '9', '8', '7', '1', '2', '3', '6', '5', '3', '1', '9', '8', '7', '6', '5', '8', '7', '1', '2', '3', '6', '4', '5', '3', '1', '6', '5', '8', '7')\n",
      "('9', '8', '7', '3', '1', '6', '4', '5', '6', '5', '1', '2', '3', '9', '8', '7', '8', '7', '6', '4', '5', '1', '2', '3', '3', '1', '6', '4', '5', '9', '8', '7', '9', '8', '7', '1', '2', '3', '6', '5')\n",
      "('9', '8', '7', '1', '2', '3', '6', '5', '6', '4', '5', '3', '1', '8', '7', '8', '7', '6', '5', '1', '2', '3', '1', '2', '3', '6', '5', '8', '7', '8', '7', '1', '2', '3', '6', '5')\n"
     ]
    }
   ],
   "source": [
    "for i in range(10):\n",
    "    print(data['train_sequences'][i])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Meta data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "dict_keys(['non_terminal_applied_position_map', 'sequence_freq', 'sentence_prob_dict'])"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "filename_meta = f\"meta_data_{grammar_name}_10000_5.pkl\"\n",
    "\n",
    "with open(filename_meta, 'rb') as f:\n",
    "    meta_data = pickle.load(f)\n",
    "\n",
    "meta_data.keys()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0: 1.1920928955078125e-07\n",
      "1: 1.1920928955078125e-07\n",
      "2: 1.4551915228366852e-11\n",
      "3: 1.1920928955078125e-07\n",
      "4: 1.1920928955078125e-07\n",
      "5: 1.4551915228366852e-11\n",
      "6: 1.1920928955078125e-07\n",
      "7: 1.1920928955078125e-07\n",
      "8: 1.1920928955078125e-07\n",
      "9: 1.1920928955078125e-07\n",
      "\n",
      "0: 1.1920928955078125e-07\n",
      "1: 1.4551915228366852e-11\n",
      "2: 1.1920928955078125e-07\n",
      "3: 1.1920928955078125e-07\n",
      "4: 1.1920928955078125e-07\n",
      "5: 1.4551915228366852e-11\n",
      "6: 1.4551915228366852e-11\n",
      "7: 1.4551915228366852e-11\n",
      "8: 1.4551915228366852e-11\n",
      "9: 1.4551915228366852e-11\n"
     ]
    }
   ],
   "source": [
    "# get the generation probability of each sentence\n",
    "# train\n",
    "for i in range(10):\n",
    "    print(f\"{i}: {meta_data['sentence_prob_dict'][data['train_sequences'][i]]}\")\n",
    "print()\n",
    "\n",
    "\n",
    "# test\n",
    "for i in range(10):\n",
    "    print(f\"{i}: {meta_data['sentence_prob_dict'][data['test_sequences'][i]]}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "formal_grammars",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
