---
# Grid sweep "lr_figure_3_base".
#
# Three parameters carry a `values` list and are therefore swept:
#   dataset.dimension (7) x dataset.sequence_length (4) x run.random_seed (5)
# i.e. 140 combinations in a full grid. Every other parameter is pinned with
# a single `value`.
#
# Dotted keys such as `dataset.iters` are flat parameter names, not nested
# YAML mappings. NOTE(review): presumably they are forwarded as Hydra-style
# overrides by the launcher named in the top-level `program` key -- confirm.
method: grid
name: lr_figure_3_base
parameters:
  # ---- dataset ------------------------------------------------------------
  dataset:
    value: single_location_linear_regression
  dataset.burstiness:
    value: 1
  # Swept. The values step by roughly a factor of sqrt(2) from 16 to 128.
  dataset.dimension:
    values: [16, 23, 32, 45, 64, 91, 128]
  dataset.iters:
    value: 64
  dataset.p_repeat:
    value: 0
  dataset.random_relevant_token_positions:
    value: true
  # Swept. Powers of two from 32 to 256.
  dataset.sequence_length:
    values: [32, 64, 128, 256]
  dataset.show_relevant_token:
    value: true
  dataset.train_data_size:
    value: 4096

  # ---- model --------------------------------------------------------------
  model:
    value: transformer
  model.embedding_dim:
    value: 256
  model.n_heads:
    value: 4
  model.n_layers:
    value: 2

  # ---- script selection ---------------------------------------------------
  # This `program` is a sweep parameter and is distinct from the top-level
  # `program` key at the end of this file.
  # NOTE(review): presumably the script the launcher dispatches to -- confirm.
  program:
    value: sl_lr_learning.py

  # ---- run ----------------------------------------------------------------
  run.det_run:
    value: false
  # Swept. Five seeds per (dimension, sequence_length) pair.
  run.random_seed:
    values: [5, 6, 7, 8, 9]
  run.start_from_scratch:
    value: true
  run.wandb_writer:
    value: true

  # ---- training -----------------------------------------------------------
  training.batch_size:
    value: 32
  training.eval_interval:
    value: 50
  # 10000 is fine when T < 64 and d < 64.
  training.iters:
    value: 50000
  training.lr:
    value: 0.0001
  training.opt:
    value: adam
  training.save_checkpoint:
    value: false

# Entry point of the sweep itself (top-level key, not a swept parameter).
program: sweeps/run_with_hydra.py