{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Make datasets to analyze\n",
    "\n",
    "Each dataset should have a training set `train.pkl`, a validation set `val.pkl`. If there is noise involved, there should be an additional `train_noiselss.pkl`, `dev_noiseless.pkl`.\n",
    "\n",
    "This notebook serves as a manifest for reproducing precisely the datasets in this directory. Data access notes:\n",
    " - The data directory name is how you will access this data using `dataloader.py` "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "# autoreload magic\n",
    "%load_ext autoreload\n",
    "%autoreload 2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pickle\n",
    "import os \n",
    "from mindreadingautobots.sequence_generators import make_datasets, data_io"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Generating counterexamples\n",
    "\n",
    "Here we pick specific bitflip rate $p$ and specific boolean functions for which the noisy optimal classifier is _lower_ sensitivity than the boolean function $f$, which results in the model getting ``tricked'' by the noise into not learning the boolean function."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Generating counterexample100110_nbits20_n2000_bf30_seed1234 with p_bitflip=0.3\n",
      "idx for sparse function: save these: [3, 6, 7, 13, 18]\n",
      "Saved counterexample100110_nbits20_n2000_bf30_seed1234/train.pkl, counterexample100110_nbits20_n2000_bf30_seed1234/val.pkl, counterexample100110_nbits20_n2000_bf30_seed1234/noiseless_train.pkl, counterexample100110_nbits20_n2000_bf30_seed1234/noiseless_val.pkl\n",
      "Generating counterexample011000_nbits20_n2000_bf30_seed1234 with p_bitflip=0.3\n",
      "idx for sparse function: save these: [2, 5, 9, 15, 16]\n",
      "Saved counterexample011000_nbits20_n2000_bf30_seed1234/train.pkl, counterexample011000_nbits20_n2000_bf30_seed1234/val.pkl, counterexample011000_nbits20_n2000_bf30_seed1234/noiseless_train.pkl, counterexample011000_nbits20_n2000_bf30_seed1234/noiseless_val.pkl\n",
      "Generating counterexample011010_nbits20_n2000_bf30_seed1234 with p_bitflip=0.3\n",
      "idx for sparse function: save these: [0, 1, 4, 8, 12]\n",
      "Saved counterexample011010_nbits20_n2000_bf30_seed1234/train.pkl, counterexample011010_nbits20_n2000_bf30_seed1234/val.pkl, counterexample011010_nbits20_n2000_bf30_seed1234/noiseless_train.pkl, counterexample011010_nbits20_n2000_bf30_seed1234/noiseless_val.pkl\n"
     ]
    }
   ],
   "source": [
    "counterexample_signatures = [\n",
    "    [1, 0, 0, 1, 1, 0], # balanced! equal number of 0's and 1's\n",
    "    [0, 1, 1, 0, 0, 0], # imbalanced, but almost balanced\n",
    "    [0, 1, 1, 0, 1, 0], # very imbalanced\n",
    "]\n",
    "\n",
    "idx = [\n",
    "    [3, 6, 7, 13, 18],\n",
    "    [2, 5, 9, 15, 16],\n",
    "    [0, 1, 4, 8, 12]\n",
    "]\n",
    "\n",
    "p_bitflips = [0.3]\n",
    "n_bits = 20\n",
    "seed = 1234\n",
    "n_val = 10000 # number of validation examples\n",
    "n_train = 2000\n",
    "for signature, subseq_idx in zip(counterexample_signatures, idx):\n",
    "    k = len(subseq_idx)\n",
    "    gen_name = \"counterexample\" + \"\".join([str(i) for i in signature])\n",
    "    signature = dict(zip(range(len(signature)), signature))\n",
    "\n",
    "\n",
    "    for p_bitflip in p_bitflips:\n",
    "        p100 = int(p_bitflip*100)\n",
    "        suffix = f\"_nbits{n_bits}_n{n_train}_bf{p100}_seed{seed}\"\n",
    "        dirname = gen_name + suffix\n",
    "        print(f\"Generating {dirname} with p_bitflip={p_bitflip}\")\n",
    "        # If your dataset has a hidden subset, update this list:\n",
    "\n",
    "        X, Z, subseq_idx = make_datasets.sparse_boolean_weightbased_k_n(n_bits, k, n_train + n_val, signature, p_bitflip=p_bitflip, seed=seed, subseq_idx=subseq_idx)\n",
    "        print(\"idx for sparse function: save these:\", subseq_idx)\n",
    "\n",
    "        if p_bitflip == 0:\n",
    "            Z = X\n",
    "        Z_train = Z[:n_train]\n",
    "        Z_val = Z[n_train:]\n",
    "\n",
    "        # Check if the data directory exists, if not create it\n",
    "        if not os.path.exists(dirname):\n",
    "            os.makedirs(dirname)\n",
    "\n",
    "        train_path = f\"{dirname}/train.pkl\"\n",
    "        val_path = f\"{dirname}/val.pkl\"\n",
    "        data_io.save_numpy_as_dict(Z_train, train_path)\n",
    "        data_io.save_numpy_as_dict(Z_val, val_path)\n",
    "\n",
    "        X_train = X[:n_train]\n",
    "        X_val = X[n_train:]\n",
    "        noiseless_train_path = f\"{dirname}/noiseless_train.pkl\"\n",
    "        noiseless_val_path = f\"{dirname}/noiseless_val.pkl\"\n",
    "        data_io.save_numpy_as_dict(X_train, noiseless_train_path)\n",
    "        data_io.save_numpy_as_dict(X_val, noiseless_val_path)\n",
    "        print(f\"Saved {train_path}, {val_path}, {noiseless_train_path}, {noiseless_val_path}\")\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Generate majority datasets"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "idx for sparse function: save these: [ 8 39 14  1  4]\n",
      "Generating sparse_majority_k5_nbits41_n2000_bf0_seed1234 with p_bitflip=0.0\n",
      "idx for sparse function: save these: [37 45  6]\n",
      "Generating sparse_majority_k3_nbits51_n2000_bf0_seed1234 with p_bitflip=0.0\n"
     ]
    }
   ],
   "source": [
    "# Generate data with bitflip values\n",
    "n_val = 10000 # number of validation examples\n",
    "seed = 1234\n",
    "n_train = 2000\n",
    "# variables\n",
    "# p_bitflips = [0.05, 0.15, 0.4, 0.45]\n",
    "p_bitflips = [0.0]\n",
    "nondeterms = [0.0, 0.10, 0.20] # these are the nondeterministic values for the not_majority_4lookback dataset NOT BITFLIP VALUES\n",
    "\n",
    "def sparse_majority_40_5(n_data, p_bitflip, seed, subseq_idx=None):\n",
    "    return make_datasets.sparse_majority_k_n(41, 5, n_data, p_bitflip, seed, subseq_idx=subseq_idx)\n",
    "\n",
    "def sparse_majority_50_3(n_data, p_bitflip, seed, subseq_idx=None):\n",
    "    \"\"\"Wrapper for sparse_majority_k_n with k=3\"\"\"\n",
    "    return make_datasets.sparse_majority_k_n(51, 3, n_data, p_bitflip, seed, subseq_idx=subseq_idx)\n",
    "\n",
    "generators = {\n",
    "    \"sparse_majority_k5\": sparse_majority_40_5,\n",
    "    \"sparse_majority_k3\": sparse_majority_50_3\n",
    "}\n",
    "nbits_list = [41, 51]\n",
    "\n",
    "for i, (gen_name, generator) in enumerate(generators.items()):\n",
    "    for p_bitflip in p_bitflips:\n",
    "        n_bits = nbits_list[i]\n",
    "        idx = None\n",
    "        X, Z, subseq_idx = generator(n_train + n_val, p_bitflip, seed, subseq_idx=idx) # now we keep the subseq_idx\n",
    "        print(\"idx for sparse function: save these:\", subseq_idx)\n",
    "        p100 = int(p_bitflip*100)\n",
    "        suffix = f\"_nbits{n_bits}_n{n_train}_bf{p100}_seed{seed}\"\n",
    "        dirname = gen_name + suffix\n",
    "        print(f\"Generating {dirname} with p_bitflip={p_bitflip}\")\n",
    "\n",
    "        if p_bitflip == 0:\n",
    "            Z = X\n",
    "        Z_train = Z[:n_train]\n",
    "        Z_val = Z[n_train:]\n",
    "\n",
    "        # Check if the data directory exists, if not create it\n",
    "        if not os.path.exists(dirname):\n",
    "            os.makedirs(dirname)\n",
    "\n",
    "        train_path = f\"{dirname}/train.pkl\"\n",
    "        val_path = f\"{dirname}/val.pkl\"\n",
    "        data_io.save_numpy_as_dict(Z_train, train_path)\n",
    "        data_io.save_numpy_as_dict(Z_val, val_path)\n",
    "\n",
    "        X_train = X[:n_train]\n",
    "        X_val = X[n_train:]\n",
    "        noiseless_train_path = f\"{dirname}/noiseless_train.pkl\"\n",
    "        noiseless_val_path = f\"{dirname}/noiseless_val.pkl\"\n",
    "        data_io.save_numpy_as_dict(X_train, noiseless_train_path)\n",
    "        data_io.save_numpy_as_dict(X_val, noiseless_val_path)\n",
    "\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Generating sparse_majority_k4_nbits31_n2000_bf0_seed1234 with p_bitflip=0\n",
      "idx for sparse function: save these: [ 7 10  4  1]\n",
      "Generating sparse_majority_k4_nbits31_n2000_bf1.2_seed1234 with p_bitflip=0.012\n",
      "idx for sparse function: save these: [ 7 10  4  1]\n",
      "Generating sparse_majority_k4_nbits31_n2000_bf3.0_seed1234 with p_bitflip=0.03\n",
      "idx for sparse function: save these: [ 7 10  4  1]\n",
      "Generating sparse_majority_k4_nbits31_n2000_bf5.3_seed1234 with p_bitflip=0.053\n",
      "idx for sparse function: save these: [ 7 10  4  1]\n",
      "Generating sparse_majority_k4_nbits31_n2000_bf8.0_seed1234 with p_bitflip=0.08\n",
      "idx for sparse function: save these: [ 7 10  4  1]\n",
      "Generating sparse_majority_k4_nbits31_n2000_bf11.1_seed1234 with p_bitflip=0.111\n",
      "idx for sparse function: save these: [ 7 10  4  1]\n",
      "Generating sparse_majority_k4_nbits31_n2000_bf14.8_seed1234 with p_bitflip=0.148\n",
      "idx for sparse function: save these: [ 7 10  4  1]\n",
      "Generating sparse_majority_k4_nbits31_n2000_bf19.2_seed1234 with p_bitflip=0.192\n",
      "idx for sparse function: save these: [ 7 10  4  1]\n",
      "Generating sparse_majority_k4_nbits31_n2000_bf24.6_seed1234 with p_bitflip=0.246\n",
      "idx for sparse function: save these: [ 7 10  4  1]\n",
      "Generating sparse_majority_k4_nbits31_n2000_bf31.9_seed1234 with p_bitflip=0.319\n",
      "idx for sparse function: save these: [ 7 10  4  1]\n"
     ]
    }
   ],
   "source": [
    "# Generate data with bitflip values\n",
    "n_val = 10000 # number of validation examples\n",
    "seed = 1234\n",
    "n_train = 5000\n",
    "n_bits = 7 # number of TOTAL bits\n",
    "# variables\n",
    "# p_bitflips = [0.05, 0.15, 0.4, 0.45]\n",
    "p_bitflips = [0.0, 0.10, 0.20]\n",
    "nondeterms = [0.0, 0.10, 0.20] # these are the nondeterministic values for the not_majority_4lookback dataset NOT BITFLIP VALUES\n",
    "\n",
    "\n",
    "# def hamilton_6_choose_4(n_data, n_bits, p_bitflip, seed, **kwargs):\n",
    "#     return make_datasets.k_choose_m_hamilton_forecast_dataset(k=6, m=4, n_data=n_data, n_bits=n_bits, p_bitflip=p_bitflip, seed=seed, **kwargs)\n",
    "\n",
    "def notmajority_nondeterm_k5(n_data, n_bits, nondeterm, seed, **kwargs):\n",
    "    return make_datasets.not_majority_5lookback_nondeterministic(n_data, n_bits, nondeterm, seed)\n",
    "\n",
    "def hamilton_6_choose_6(n_data, n_bits, p_bitflip, seed, **kwargs):\n",
    "    return make_datasets.k_choose_m_hamilton_forecast_dataset(k=6, m=6, n_data=n_data, n_bits=n_bits, p_bitflip=p_bitflip, seed=seed, **kwargs)\n",
    "\n",
    "def sparse_majority_k5(n_data, n_bits, p_bitflip, seed, subseq_idx=None):\n",
    "    return make_datasets.sparse_majority_k_n(n_bits, 5, n_data, p_bitflip, seed, subseq_idx=subseq_idx)\n",
    "\n",
    "def sparse_majority_k3(n_data, n_bits, p_bitflip, seed, subseq_idx=None):\n",
    "    \"\"\"Wrapper for sparse_majority_k_n with k=3\"\"\"\n",
    "    return make_datasets.sparse_majority_k_n(n_bits, 3, n_data, p_bitflip, seed, subseq_idx=subseq_idx)\n",
    "\n",
    "def sparse_majority_k4(n_data, n_bits, p_bitflip, seed, subseq_idx=None):\n",
    "    \"\"\"Wrapper for sparse_majority_k_n with k=4\"\"\"\n",
    "    return make_datasets.sparse_majority_k_n(n_bits, 4, n_data, p_bitflip, seed, subseq_idx=subseq_idx)\n",
    "\n",
    "def sparity_k4(n_data, n_bits, p_bitflip, seed, subseq_idx=None):\n",
    "    \"\"\"Wrapper for sparse_parity_k_n with k=4\"\"\"\n",
    "    return make_datasets.sparse_parity_k_n(n_bits, 4, n_data, p_bitflip, seed, subseq_idx=subseq_idx)\n",
    "\n",
    "\n",
    "generators = {\n",
    "    # \"parity_4lookback\": make_datasets.parity_4lookback,\n",
    "    # \"not_majority_4lookback\": make_datasets.not_majority_4lookback,\n",
    "    # \"sparse_parity_k4\": sparity_k4,\n",
    "    # \"hamilton_6_choose_4\": hamilton_6_choose_4,\n",
    "    \"hamilton_6_choose_6\": hamilton_6_choose_6,\n",
    "    \"not_majority_4lookback\": notmajority_nondeterm_k5\n",
    "    # \"sparse_majority_k5\": sparse_majority_k5\n",
    "\n",
    "}\n",
    "\n",
    "for gen_name, generator in generators.items():\n",
    "    # Use a fixed subseq_idx for all data from the same generator\n",
    "    # This is useful for consistency across different p_bitflip values\n",
    "    subseq_idx = None\n",
    "    if gen_name == \"hamilton_6_choose_4\":\n",
    "        subseq_idx = [2, 3, 4, 5]\n",
    "\n",
    "    nondeterm_list = p_bitflips\n",
    "    if gen_name == \"not_majority_4lookback\":\n",
    "        nondeterm_list = nondeterms\n",
    "    for p_bitflip in nondeterm_list:\n",
    "        p100 = int(p_bitflip*100)\n",
    "        suffix = f\"_nbits{n_bits}_n{n_train}_bf{p100}_seed{seed}\"\n",
    "        dirname = gen_name + suffix\n",
    "        print(f\"Generating {dirname} with p_bitflip={p_bitflip}\")\n",
    "        # If your dataset has a hidden subset, update this list:\n",
    "        idx = None\n",
    "        if subseq_idx is not None:\n",
    "            idx = subseq_idx\n",
    "        X, Z, subseq_idx = generator(n_train + n_val, n_bits, p_bitflip, seed, subseq_idx=idx) # now we keep the subseq_idx\n",
    "        # print(\"idx for sparse function: save these:\", subseq_idx)\n",
    "\n",
    "        if p_bitflip == 0:\n",
    "            Z = X\n",
    "        Z_train = Z[:n_train]\n",
    "        Z_val = Z[n_train:]\n",
    "\n",
    "        # Check if the data directory exists, if not create it\n",
    "        if not os.path.exists(dirname):\n",
    "            os.makedirs(dirname)\n",
    "\n",
    "        train_path = f\"{dirname}/train.pkl\"\n",
    "        val_path = f\"{dirname}/val.pkl\"\n",
    "        data_io.save_numpy_as_dict(Z_train, train_path)\n",
    "        data_io.save_numpy_as_dict(Z_val, val_path)\n",
    "\n",
    "        X_train = X[:n_train]\n",
    "        X_val = X[n_train:]\n",
    "        noiseless_train_path = f\"{dirname}/noiseless_train.pkl\"\n",
    "        noiseless_val_path = f\"{dirname}/noiseless_val.pkl\"\n",
    "        data_io.save_numpy_as_dict(X_train, noiseless_train_path)\n",
    "        data_io.save_numpy_as_dict(X_val, noiseless_val_path)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Generating parity_4lookback_n5000_nondeterm0_seed1234 with nondeterm=0.0\n"
     ]
    },
    {
     "ename": "NotImplementedError",
     "evalue": "output signature is wrong, and seed bits are missing.",
     "output_type": "error",
     "traceback": [
      "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[1;31mNotImplementedError\u001b[0m                       Traceback (most recent call last)",
      "Cell \u001b[1;32mIn[5], line 27\u001b[0m\n\u001b[0;32m     25\u001b[0m dirname \u001b[38;5;241m=\u001b[39m gen_name \u001b[38;5;241m+\u001b[39m suffix\n\u001b[0;32m     26\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mGenerating \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mdirname\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m with nondeterm=\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mnondeterm\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m---> 27\u001b[0m X, Z, idx \u001b[38;5;241m=\u001b[39m \u001b[43mgenerator\u001b[49m\u001b[43m(\u001b[49m\u001b[43mn_train\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m+\u001b[39;49m\u001b[43m \u001b[49m\u001b[43mn_val\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mn_bits\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mnondeterm\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mseed\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m     28\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mPAY ATTENTION: idx=\u001b[39m\u001b[38;5;124m\"\u001b[39m, idx)\n\u001b[0;32m     29\u001b[0m X_train \u001b[38;5;241m=\u001b[39m X[:n_train]\n",
      "File \u001b[1;32m~\\Desktop\\projects\\MindReadingAutobot\\mindreadingautobots\\src\\mindreadingautobots\\sequence_generators\\make_datasets.py:258\u001b[0m, in \u001b[0;36mparity_4lookback_nondeterministic\u001b[1;34m(n_data, n_bits, nondeterm, seed)\u001b[0m\n\u001b[0;32m    251\u001b[0m \u001b[38;5;250m\u001b[39m\u001b[38;5;124;03m\"\"\"Generate PARITY forecasting data with n_bits bits.\u001b[39;00m\n\u001b[0;32m    252\u001b[0m \u001b[38;5;124;03m\u001b[39;00m\n\u001b[0;32m    253\u001b[0m \u001b[38;5;124;03mthe `nondeterm` parameter is the probability of bitflipping a bit during \u001b[39;00m\n\u001b[0;32m    254\u001b[0m \u001b[38;5;124;03msequence generation. This is not equivalent to the `p_bitflip` parameter.\u001b[39;00m\n\u001b[0;32m    255\u001b[0m \n\u001b[0;32m    256\u001b[0m \u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[0;32m    257\u001b[0m transition_matrix \u001b[38;5;241m=\u001b[39m {\u001b[38;5;241m0\u001b[39m: nondeterm, \u001b[38;5;241m1\u001b[39m: \u001b[38;5;241m1\u001b[39m \u001b[38;5;241m-\u001b[39m nondeterm, \u001b[38;5;241m2\u001b[39m: nondeterm, \u001b[38;5;241m3\u001b[39m: \u001b[38;5;241m1\u001b[39m \u001b[38;5;241m-\u001b[39m nondeterm, \u001b[38;5;241m4\u001b[39m: nondeterm}\n\u001b[1;32m--> 258\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mk_lookback_weight_dataset\u001b[49m\u001b[43m(\u001b[49m\u001b[43mtransition_matrix\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m4\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mn_data\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mn_bits\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m0\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mseed\u001b[49m\u001b[43m)\u001b[49m\n",
      "File \u001b[1;32m~\\Desktop\\projects\\MindReadingAutobot\\mindreadingautobots\\src\\mindreadingautobots\\sequence_generators\\make_datasets.py:175\u001b[0m, in \u001b[0;36mk_lookback_weight_dataset\u001b[1;34m(transition_matrix, k, n_data, n_bits, p_bitflip, seed)\u001b[0m\n\u001b[0;32m    156\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21mk_lookback_weight_dataset\u001b[39m(transition_matrix, k, n_data, n_bits, p_bitflip, seed):\n\u001b[0;32m    157\u001b[0m \u001b[38;5;250m    \u001b[39m\u001b[38;5;124;03m\"\"\"Abstract function for _specific_ k-lookback boolean function of bitstring _weight_.\u001b[39;00m\n\u001b[0;32m    158\u001b[0m \u001b[38;5;124;03m    \u001b[39;00m\n\u001b[0;32m    159\u001b[0m \u001b[38;5;124;03m    The data generation works in 4 steps:\u001b[39;00m\n\u001b[1;32m   (...)\u001b[0m\n\u001b[0;32m    173\u001b[0m \u001b[38;5;124;03m        Z: (n_data, n_bits) array of noisy data, or None if p_bitflip is 0\u001b[39;00m\n\u001b[0;32m    174\u001b[0m \u001b[38;5;124;03m    \"\"\"\u001b[39;00m\n\u001b[1;32m--> 175\u001b[0m     \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mNotImplementedError\u001b[39;00m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124moutput signature is wrong, and seed bits are missing.\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m    177\u001b[0m     \u001b[38;5;28;01massert\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(transition_matrix) \u001b[38;5;241m==\u001b[39m k \u001b[38;5;241m+\u001b[39m \u001b[38;5;241m1\u001b[39m\n\u001b[0;32m    178\u001b[0m     \u001b[38;5;28;01massert\u001b[39;00m np\u001b[38;5;241m.\u001b[39mall([\u001b[38;5;241m0\u001b[39m \u001b[38;5;241m<\u001b[39m\u001b[38;5;241m=\u001b[39m v \u001b[38;5;241m<\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;241m1\u001b[39m \u001b[38;5;28;01mfor\u001b[39;00m v \u001b[38;5;129;01min\u001b[39;00m transition_matrix\u001b[38;5;241m.\u001b[39mvalues()])\n",
      "\u001b[1;31mNotImplementedError\u001b[0m: output signature is wrong, and seed bits are missing."
     ]
    }
   ],
   "source": [
    "# Generate data with bitflip values\n",
    "n_val = 10000 # number of validation examples \n",
    "seed = 1234 \n",
    "n_train = 5000 # number of training examples\n",
    "n_bits = 21 # number of TOTAL bits (including final bit)\n",
    "\n",
    "# Create a different dataset for every 'nondeterministic' value in this list\n",
    "# Note that _sometimes_ this means bitflip rate, but not always\n",
    "nondeterms = [0.0, 0.1, 0.2]\n",
    "\n",
    "generators = {\n",
    "    # \"parity_4lookback\": make_datasets.parity_4lookback_nondeterministic,\n",
    "    \"not_majority_4lookback\": make_datasets.not_majority_4lookback_nondeterministic,\n",
    "}\n",
    "\n",
    "for nondeterm in nondeterms:\n",
    "    p100 = int(nondeterm*100)\n",
    "    suffix = f\"_n{n_train}_nondeterm{p100}_seed{seed}\"\n",
    "\n",
    "    for gen_name, generator in generators.items():\n",
    "        dirname = gen_name + suffix\n",
    "        print(f\"Generating {dirname} with nondeterm={nondeterm}\")\n",
    "        X, Z, idx = generator(n_train + n_val, n_bits, nondeterm, seed)\n",
    "        print(\"PAY ATTENTION: idx=\", idx)\n",
    "        X_train = X[:n_train]\n",
    "        X_val = X[n_train:]\n",
    "\n",
    "        # Check if the data directory exists, if not create it\n",
    "        if not os.path.exists(dirname):\n",
    "            os.makedirs(dirname)\n",
    "\n",
    "        train_path = f\"{dirname}/train.pkl\"\n",
    "        val_path = f\"{dirname}/val.pkl\"\n",
    "        data_io.save_numpy_as_dict(X_train, train_path)\n",
    "        data_io.save_numpy_as_dict(X_val, val_path)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "sparse_parity4a\n",
      "train: 30000 40\n"
     ]
    }
   ],
   "source": [
    "# load \"./sparity40_1k/train.pkl\" \n",
    "# dir_names = ['./sparity40_1k', 'sparity40_5k', 'sparity40_25h', 'sparse_n_parity4a', 'sparse_parity4a']\n",
    "dir_names = ['sparse_parity4a']\n",
    "\n",
    "for dir_name in dir_names:\n",
    "    train_path = f'./{dir_name}/train.pkl'\n",
    "    test_path = f'./{dir_name}/val.pkl'\n",
    "\n",
    "    print(dir_name)\n",
    "    with open(train_path, 'rb') as f:\n",
    "        train = pickle.load(f)\n",
    "    xvals = train.get('line')\n",
    "    n_data = len(xvals)\n",
    "    n_bits = len(xvals[0])\n",
    "    print(\"train:\", n_data, n_bits)\n",
    "\n",
    "    # with open(test_path, 'rb') as f:\n",
    "    #     test = pickle.load(f)\n",
    "    # xvals = test.get('line')\n",
    "    # n_data = len(xvals)\n",
    "    # n_bits = len(xvals[0])\n",
    "    # print(\"val:\", n_data, n_bits)\n",
    "    # print()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "autobots",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.1"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
