{
  "runtime": {
    "seed": 0,
    "should_train": true,
    "_should_train": false,
    "device": null
  },
  "model": {

    "_class": "mamba_ssm",
    "_variant": "full",

    "class": "mamba_tiny",
    "variant": "simplified",
    "__variant": "simplified_ideal_projections",
    "____variant": "simplified_ideal_weights",

    "__class": "mamba_linear",
    "______variant": "ideal"
  },

  "grid_options": {
    "shuffle": true
  },

  "_grid": {
    "V": 512,
    "L": 128,
    "N_facts": 32,
    "D": "np.arange(2, 64+1, step=2)",
    "N": "np.arange(2, 32+1, step=1)",
    "x_axis": "D",
    "y_axis": "N"
  },

  "__grid": {
    "V": "np.logspace(5, 13, num=32, base=2)",
    "L": "np.linspace(32, 1024, num=32)",
    "N_facts": 32,
    "D": 32,
    "N": 16,
    "x_axis": "V",
    "y_axis": "L"
  },

  "___grid": {
    "V": "(np.logspace(6, 12, num=16, base=2).astype(int) // 2) * 2 + 2",
    "L": "(np.linspace(2**6, 2**9, num=16).astype(int) // 2) * 2",
    "N_facts": 16,
    "D": 32,
    "N": 8,
    "x_axis": "V",
    "y_axis": "L"
  },

  "____grid": {
    "V": "(np.logspace(7, 12, num=16, base=2).astype(int) // 2) * 2 + 2",
    "L": 128,
    "N_facts": 16,
    "D": "np.arange(2, 64+1, step=2)",
    "N": 16,
    "x_axis": "D",
    "y_axis": "V"
  },

  "grid": {
    "V": "(np.logspace(6, 12, num=32, base=2).astype(int) // 2) * 2 + 2",
    "L": 64,
    "N_facts": 16,
    "D": "np.arange(2, 64+1, step=2)",
    "N": null,
    "scale_N_with_D": true,
    "D_to_N_ratio": 4,
    "x_axis": "D",
    "y_axis": "V"
  },

  "best_of_n_seeds": {
    "start_seed": 0,
    "n_seeds": 50,
    "threshold_mean_learned_n_facts": 1.5
  },

  "dataset": {

    "V": null,
    "L": null,
    "N_facts": null,

    "power_a": 1.0,
    "random_non_queries": true,

    "____batch_size": 512,
    "___batch_size": 256,
    "_batch_size": 150,
    "batch_size": 128,
    "__batch_size": 64,

    "split_size": {
      "train": null,
      "val": 10000,
      "test": 10000
    }
  },

  "training": {
    "optimizer": {
      "learning_rate": 3e-4,
      "betas": [0.9, 0.95],
      "weight_decay": 0.1
    },
    "scheduler": {
      "num_warmup_steps": 500,
      "num_decay_steps": 15000,
      "decay_factor": 0.1
    },
    "scaling": {
      "learning_rate": "1e-2",
      "_learning_rate": "5e-3",
      "__learning_rate": "3e-4 * np.clip(10 - 9*((np.log2(D) - 5)/4), min=1, max=10)",
      "___learning_rate": "1e-4 * np.clip(10 - 9*((np.log2(D) - 5)/4), min=1, max=10)",
      "____learning_rate": "3e-5 * np.clip(10 - 9*((np.log2(D) - 5)/4), min=1, max=10)",
      "batch_size": null
    },
    "dropout_rate": 0.1,
    "label_smoothing": 0.1,
    "clip_grad": true,
    "__clip_grad_max_norm": 1.5,
    "_clip_grad_max_norm": 1.00,
    "clip_grad_max_norm": 0.75,
    "_grad_accum_steps": 1,
    "grad_accum_steps": 1,
    "use_amp": false,
    "max_num_steps": 500,
    "_max_num_steps": 3000,
    "__max_num_steps": 5000,
    "___max_num_steps": 10000,
    "____max_num_steps": 15000,
    "_____max_num_steps": 25000,
    "______max_num_steps": 100000,
    "evaluate_any_num_steps": 2000,
    "_evaluate_any_num_steps": 250,
    "__evaluate_any_num_steps": 100,
    "threshold_accuracy": 1.00,
    "_threshold_accuracy": 0.995,
    "__threshold_accuracy": 0.99,
    "save_logs_any_num_steps": 100,
    "save_model_at_evaluation": false,
    "view_train_tqdm": true,
    "view_evaluation_tqdm": false
  },
  "parallel": {
    "devices_to_use": null,
    "_devices_to_use": [0, 1],
    "_____num_processes_per_device": 1,
    "_num_processes_per_device": 5,
    "num_processes_per_device": 8,
    "__num_processes_per_device": 20,
    "___num_processes_per_device": 30,
    "____num_processes_per_device": 32,
    "num_cpu_threads_per_process": 1,
    "_num_cpu_threads_per_process": 3,
    "mp_start_method": "spawn",
    "raise_exceptions": false
  },
  "wandb": {
    "_activate": true,
    "activate": false,
    "verbose": false,
    "log_val": false,
    "log_per_n_results": false,
    "log_grad_flow": false,
    "project_name": "Mamba AR Scaling Laws",
    "run_name": "Run (Default)",
    "cache_dir": "../wandb/cache",
    "output_dir": "../wandb/output"
  },
  "io": {
    "results_dir": "../results",
    "run_results_dir": null,
    "_output_dir": "../output",
    "time_zone": null
  }
}