---
# Sweep definition "ar_figure_17".
# NOTE(review): the method/name/parameters/program layout looks like a
# Weights & Biases sweep config -- confirm against the tool that consumes it.
name: ar_figure_17

# Top-level program for the sweep. Presumably a Hydra launcher that receives
# the dotted keys under `parameters` as overrides -- TODO confirm.
program: sweeps/run_with_hydra.py

# Every parameter below is pinned with a single `value` (no `values` lists),
# so this file describes one fixed configuration.
method: grid

parameters:
  # Dataset selection and its settings.
  dataset:
    value: associative_recall
  dataset.n_pairs:
    value: 32
  # Train and test use the same burstiness setting.
  dataset.test_dist_args.burstiness:
    value: 1
  dataset.train_dist_args.burstiness:
    value: 1
  dataset.vocab_size:
    value: 256

  # Model selection and architecture switches.
  model:
    value: transformer
  model.embedding_dim:
    value: 256
  model.enable_mlp:
    value: true
  model.enable_norm:
    value: true
  model.enable_skip:
    value: true
  model.n_heads:
    value: 4
  model.n_layers:
    value: 4
  model.pos_enc:
    value: sin_cos

  # A sweep parameter named `program`, distinct from the top-level `program`
  # key. Presumably the script the launcher dispatches to -- TODO confirm.
  program:
    value: ar_ic_learning.py

  # Run-level switches.
  run.det_run:
    value: false
  run.log_token_attention:
    value: true
  run.random_seed:
    value: 6
  run.start_from_scratch:
    value: true
  run.wandb_writer:
    value: true

  # Training schedule and data sizes.
  training.batch_size:
    value: 32
  training.eval_interval:
    value: 1000
  training.iters:
    value: 250000
  training.lr:
    value: 0.0001
  # Far larger than training.iters; presumably chosen so that plotting never
  # triggers during a run -- verify against the training script.
  training.plot_interval:
    value: 10000000000
  training.save_checkpoint:
    value: false
  training.test_data_size:
    value: 4096
  training.train_data_size:
    value: 32768