---
# Sweep definition "lr_figure_11".
# The layout (method / name / parameters / program) matches a Weights & Biases
# sweep config -- NOTE(review): confirm against the launcher actually used.
# Only `dataset.dimension` (3 values) and `run.random_seed` (5 values) list
# more than one value; every other parameter is pinned to a single value.

# Script named as the sweep's program. Not to be confused with the
# `program` entry nested under `parameters` below.
program: sweeps/run_with_hydra.py
name: lr_figure_11
method: grid

# NOTE(review): the dotted keys look like Hydra-style override paths
# (group.option) -- presumably forwarded by run_with_hydra.py; confirm.
parameters:
  # --- dataset ---
  dataset:
    value: single_location_linear_regression
  dataset.burstiness:
    value: 1
  # Swept: one of the two multi-valued axes.
  dataset.dimension:
    values: [16, 32, 64]
  dataset.iters:
    value: 64
  dataset.p_repeat:
    value: 0
  dataset.random_relevant_token_positions:
    value: false
  dataset.sequence_length:
    value: 128
  dataset.show_relevant_token:
    value: false
  dataset.train_data_size:
    value: 4096

  # --- model ---
  model:
    value: transformer
  model.n_heads:
    value: 4
  model.n_layers:
    value: 2

  # NOTE(review): a sweep *parameter* that happens to be called `program`;
  # presumably the training script the top-level program dispatches to --
  # verify in sweeps/run_with_hydra.py.
  program:
    value: sl_lr_learning.py

  # --- run ---
  run.det_run:
    value: false
  # Swept: the second multi-valued axis.
  run.random_seed:
    values: [20, 21, 22, 23, 24]
  run.start_from_scratch:
    value: true
  run.wandb_writer:
    value: true

  # --- training ---
  training.batch_size:
    value: 32
  training.eval_interval:
    value: 25
  training.iters:
    value: 20000
  training.lr:
    value: 0.0001
  training.opt:
    value: adam
  training.save_checkpoint:
    value: true
  # NOTE(review): same value as eval_interval; units (iterations?) are not
  # visible from this file -- confirm the checkpoint volume is intended.
  training.save_interval:
    value: 25